1 // SPDX-License-Identifier: GPL-2.0-only 1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 2 /* 3 * fs/dax.c - Direct Access filesystem code 3 * fs/dax.c - Direct Access filesystem code 4 * Copyright (c) 2013-2014 Intel Corporation 4 * Copyright (c) 2013-2014 Intel Corporation 5 * Author: Matthew Wilcox <matthew.r.wilcox@in 5 * Author: Matthew Wilcox <matthew.r.wilcox@intel.com> 6 * Author: Ross Zwisler <ross.zwisler@linux.in 6 * Author: Ross Zwisler <ross.zwisler@linux.intel.com> 7 */ 7 */ 8 8 9 #include <linux/atomic.h> 9 #include <linux/atomic.h> 10 #include <linux/blkdev.h> 10 #include <linux/blkdev.h> 11 #include <linux/buffer_head.h> 11 #include <linux/buffer_head.h> 12 #include <linux/dax.h> 12 #include <linux/dax.h> 13 #include <linux/fs.h> 13 #include <linux/fs.h> >> 14 #include <linux/genhd.h> 14 #include <linux/highmem.h> 15 #include <linux/highmem.h> 15 #include <linux/memcontrol.h> 16 #include <linux/memcontrol.h> 16 #include <linux/mm.h> 17 #include <linux/mm.h> 17 #include <linux/mutex.h> 18 #include <linux/mutex.h> 18 #include <linux/pagevec.h> 19 #include <linux/pagevec.h> 19 #include <linux/sched.h> 20 #include <linux/sched.h> 20 #include <linux/sched/signal.h> 21 #include <linux/sched/signal.h> 21 #include <linux/uio.h> 22 #include <linux/uio.h> 22 #include <linux/vmstat.h> 23 #include <linux/vmstat.h> 23 #include <linux/pfn_t.h> 24 #include <linux/pfn_t.h> 24 #include <linux/sizes.h> 25 #include <linux/sizes.h> 25 #include <linux/mmu_notifier.h> 26 #include <linux/mmu_notifier.h> 26 #include <linux/iomap.h> 27 #include <linux/iomap.h> 27 #include <linux/rmap.h> << 28 #include <asm/pgalloc.h> 28 #include <asm/pgalloc.h> 29 29 30 #define CREATE_TRACE_POINTS 30 #define CREATE_TRACE_POINTS 31 #include <trace/events/fs_dax.h> 31 #include <trace/events/fs_dax.h> 32 32 >> 33 static inline unsigned int pe_order(enum page_entry_size pe_size) >> 34 { >> 35 if (pe_size == PE_SIZE_PTE) >> 36 return PAGE_SHIFT - PAGE_SHIFT; >> 37 if (pe_size == PE_SIZE_PMD) >> 38 return PMD_SHIFT - PAGE_SHIFT; >> 39 if (pe_size == PE_SIZE_PUD) >> 40 return PUD_SHIFT - PAGE_SHIFT; >> 41 return ~0; >> 42 } >> 43 33 /* We choose 4096 entries - same as per-zone p 44 /* We choose 4096 entries - same as per-zone page wait tables */ 34 #define DAX_WAIT_TABLE_BITS 12 45 #define DAX_WAIT_TABLE_BITS 12 35 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_ 46 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) 36 47 37 /* The 'colour' (ie low bits) within a PMD of 48 /* The 'colour' (ie low bits) within a PMD of a page offset. */ 38 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHI 49 #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) 39 #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIF 50 #define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) 40 51 >> 52 /* The order of a PMD entry */ >> 53 #define PMD_ORDER (PMD_SHIFT - PAGE_SHIFT) >> 54 41 static wait_queue_head_t wait_table[DAX_WAIT_T 55 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; 42 56 43 static int __init init_dax_wait_table(void) 57 static int __init init_dax_wait_table(void) 44 { 58 { 45 int i; 59 int i; 46 60 47 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES 61 for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++) 48 init_waitqueue_head(wait_table 62 init_waitqueue_head(wait_table + i); 49 return 0; 63 return 0; 50 } 64 } 51 fs_initcall(init_dax_wait_table); 65 fs_initcall(init_dax_wait_table); 52 66 53 /* 67 /* 54 * DAX pagecache entries use XArray value entr 68 * DAX pagecache entries use XArray value entries so they can't be mistaken 55 * for pages. 
We use one bit for locking, one 69 * for pages. We use one bit for locking, one bit for the entry size (PMD) 56 * and two more to tell us if the entry is a z 70 * and two more to tell us if the entry is a zero page or an empty entry that 57 * is just used for locking. In total four sp 71 * is just used for locking. In total four special bits. 58 * 72 * 59 * If the PMD bit isn't set the entry has size 73 * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE 60 * and EMPTY bits aren't set the entry is a no 74 * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem 61 * block allocation. 75 * block allocation. 62 */ 76 */ 63 #define DAX_SHIFT (4) 77 #define DAX_SHIFT (4) 64 #define DAX_LOCKED (1UL << 0) 78 #define DAX_LOCKED (1UL << 0) 65 #define DAX_PMD (1UL << 1) 79 #define DAX_PMD (1UL << 1) 66 #define DAX_ZERO_PAGE (1UL << 2) 80 #define DAX_ZERO_PAGE (1UL << 2) 67 #define DAX_EMPTY (1UL << 3) 81 #define DAX_EMPTY (1UL << 3) 68 82 69 static unsigned long dax_to_pfn(void *entry) 83 static unsigned long dax_to_pfn(void *entry) 70 { 84 { 71 return xa_to_value(entry) >> DAX_SHIFT 85 return xa_to_value(entry) >> DAX_SHIFT; 72 } 86 } 73 87 74 static void *dax_make_entry(pfn_t pfn, unsigne 88 static void *dax_make_entry(pfn_t pfn, unsigned long flags) 75 { 89 { 76 return xa_mk_value(flags | (pfn_t_to_p 90 return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT)); 77 } 91 } 78 92 79 static bool dax_is_locked(void *entry) 93 static bool dax_is_locked(void *entry) 80 { 94 { 81 return xa_to_value(entry) & DAX_LOCKED 95 return xa_to_value(entry) & DAX_LOCKED; 82 } 96 } 83 97 84 static unsigned int dax_entry_order(void *entr 98 static unsigned int dax_entry_order(void *entry) 85 { 99 { 86 if (xa_to_value(entry) & DAX_PMD) 100 if (xa_to_value(entry) & DAX_PMD) 87 return PMD_ORDER; 101 return PMD_ORDER; 88 return 0; 102 return 0; 89 } 103 } 90 104 91 static unsigned long dax_is_pmd_entry(void *en 105 static unsigned long dax_is_pmd_entry(void *entry) 92 { 106 { 93 return xa_to_value(entry) & DAX_PMD; 107 return xa_to_value(entry) & DAX_PMD; 94 } 108 } 95 109 96 static bool dax_is_pte_entry(void *entry) 110 static bool dax_is_pte_entry(void *entry) 97 { 111 { 98 return !(xa_to_value(entry) & DAX_PMD) 112 return !(xa_to_value(entry) & DAX_PMD); 99 } 113 } 100 114 101 static int dax_is_zero_entry(void *entry) 115 static int dax_is_zero_entry(void *entry) 102 { 116 { 103 return xa_to_value(entry) & DAX_ZERO_P 117 return xa_to_value(entry) & DAX_ZERO_PAGE; 104 } 118 } 105 119 106 static int dax_is_empty_entry(void *entry) 120 static int dax_is_empty_entry(void *entry) 107 { 121 { 108 return xa_to_value(entry) & DAX_EMPTY; 122 return xa_to_value(entry) & DAX_EMPTY; 109 } 123 } 110 124 111 /* 125 /* 112 * true if the entry that was found is of a sm 126 * true if the entry that was found is of a smaller order than the entry 113 * we were looking for 127 * we were looking for 114 */ 128 */ 115 static bool dax_is_conflict(void *entry) 129 static bool dax_is_conflict(void *entry) 116 { 130 { 117 return entry == XA_RETRY_ENTRY; 131 return entry == XA_RETRY_ENTRY; 118 } 132 } 119 133 120 /* 134 /* 121 * DAX page cache entry locking 135 * DAX page cache entry locking 122 */ 136 */ 123 struct exceptional_entry_key { 137 struct exceptional_entry_key { 124 struct xarray *xa; 138 struct xarray *xa; 125 pgoff_t entry_start; 139 pgoff_t entry_start; 126 }; 140 }; 127 141 128 struct wait_exceptional_entry_queue { 142 struct wait_exceptional_entry_queue { 129 wait_queue_entry_t 
wait; 143 wait_queue_entry_t wait; 130 struct exceptional_entry_key key; 144 struct exceptional_entry_key key; 131 }; 145 }; 132 146 133 /** << 134 * enum dax_wake_mode: waitqueue wakeup behavi << 135 * @WAKE_ALL: wake all waiters in the waitqueu << 136 * @WAKE_NEXT: wake only the first waiter in t << 137 */ << 138 enum dax_wake_mode { << 139 WAKE_ALL, << 140 WAKE_NEXT, << 141 }; << 142 << 143 static wait_queue_head_t *dax_entry_waitqueue( 147 static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, 144 void *entry, struct exceptiona 148 void *entry, struct exceptional_entry_key *key) 145 { 149 { 146 unsigned long hash; 150 unsigned long hash; 147 unsigned long index = xas->xa_index; 151 unsigned long index = xas->xa_index; 148 152 149 /* 153 /* 150 * If 'entry' is a PMD, align the 'ind 154 * If 'entry' is a PMD, align the 'index' that we use for the wait 151 * queue to the start of that PMD. Th 155 * queue to the start of that PMD. This ensures that all offsets in 152 * the range covered by the PMD map to 156 * the range covered by the PMD map to the same bit lock. 153 */ 157 */ 154 if (dax_is_pmd_entry(entry)) 158 if (dax_is_pmd_entry(entry)) 155 index &= ~PG_PMD_COLOUR; 159 index &= ~PG_PMD_COLOUR; 156 key->xa = xas->xa; 160 key->xa = xas->xa; 157 key->entry_start = index; 161 key->entry_start = index; 158 162 159 hash = hash_long((unsigned long)xas->x 163 hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS); 160 return wait_table + hash; 164 return wait_table + hash; 161 } 165 } 162 166 163 static int wake_exceptional_entry_func(wait_qu 167 static int wake_exceptional_entry_func(wait_queue_entry_t *wait, 164 unsigned int mode, int sync, v 168 unsigned int mode, int sync, void *keyp) 165 { 169 { 166 struct exceptional_entry_key *key = ke 170 struct exceptional_entry_key *key = keyp; 167 struct wait_exceptional_entry_queue *e 171 struct wait_exceptional_entry_queue *ewait = 168 container_of(wait, struct wait 172 container_of(wait, struct wait_exceptional_entry_queue, wait); 169 173 170 if (key->xa != ewait->key.xa || 174 if (key->xa != ewait->key.xa || 171 key->entry_start != ewait->key.ent 175 key->entry_start != ewait->key.entry_start) 172 return 0; 176 return 0; 173 return autoremove_wake_function(wait, 177 return autoremove_wake_function(wait, mode, sync, NULL); 174 } 178 } 175 179 176 /* 180 /* 177 * @entry may no longer be the entry at the in 181 * @entry may no longer be the entry at the index in the mapping. 178 * The important information it's conveying is 182 * The important information it's conveying is whether the entry at 179 * this index used to be a PMD entry. 183 * this index used to be a PMD entry. 180 */ 184 */ 181 static void dax_wake_entry(struct xa_state *xa !! 185 static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) 182 enum dax_wake_mode << 183 { 186 { 184 struct exceptional_entry_key key; 187 struct exceptional_entry_key key; 185 wait_queue_head_t *wq; 188 wait_queue_head_t *wq; 186 189 187 wq = dax_entry_waitqueue(xas, entry, & 190 wq = dax_entry_waitqueue(xas, entry, &key); 188 191 189 /* 192 /* 190 * Checking for locked entry and prepa 193 * Checking for locked entry and prepare_to_wait_exclusive() happens 191 * under the i_pages lock, ditto for e 194 * under the i_pages lock, ditto for entry handling in our callers. 
192 * So at this point all tasks that cou 195 * So at this point all tasks that could have seen our entry locked 193 * must be in the waitqueue and the fo 196 * must be in the waitqueue and the following check will see them. 194 */ 197 */ 195 if (waitqueue_active(wq)) 198 if (waitqueue_active(wq)) 196 __wake_up(wq, TASK_NORMAL, mod !! 199 __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); 197 } 200 } 198 201 199 /* 202 /* 200 * Look up entry in page cache, wait for it to 203 * Look up entry in page cache, wait for it to become unlocked if it 201 * is a DAX entry and return it. The caller m 204 * is a DAX entry and return it. The caller must subsequently call 202 * put_unlocked_entry() if it did not lock the 205 * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry() 203 * if it did. The entry returned may have a l 206 * if it did. The entry returned may have a larger order than @order. 204 * If @order is larger than the order of the e 207 * If @order is larger than the order of the entry found in i_pages, this 205 * function returns a dax_is_conflict entry. 208 * function returns a dax_is_conflict entry. 206 * 209 * 207 * Must be called with the i_pages lock held. 210 * Must be called with the i_pages lock held. 208 */ 211 */ 209 static void *get_unlocked_entry(struct xa_stat 212 static void *get_unlocked_entry(struct xa_state *xas, unsigned int order) 210 { 213 { 211 void *entry; 214 void *entry; 212 struct wait_exceptional_entry_queue ew 215 struct wait_exceptional_entry_queue ewait; 213 wait_queue_head_t *wq; 216 wait_queue_head_t *wq; 214 217 215 init_wait(&ewait.wait); 218 init_wait(&ewait.wait); 216 ewait.wait.func = wake_exceptional_ent 219 ewait.wait.func = wake_exceptional_entry_func; 217 220 218 for (;;) { 221 for (;;) { 219 entry = xas_find_conflict(xas) 222 entry = xas_find_conflict(xas); 220 if (!entry || WARN_ON_ONCE(!xa 223 if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) 221 return entry; 224 return entry; 222 if (dax_entry_order(entry) < o 225 if (dax_entry_order(entry) < order) 223 return XA_RETRY_ENTRY; 226 return XA_RETRY_ENTRY; 224 if (!dax_is_locked(entry)) 227 if (!dax_is_locked(entry)) 225 return entry; 228 return entry; 226 229 227 wq = dax_entry_waitqueue(xas, 230 wq = dax_entry_waitqueue(xas, entry, &ewait.key); 228 prepare_to_wait_exclusive(wq, 231 prepare_to_wait_exclusive(wq, &ewait.wait, 229 TASK 232 TASK_UNINTERRUPTIBLE); 230 xas_unlock_irq(xas); 233 xas_unlock_irq(xas); 231 xas_reset(xas); 234 xas_reset(xas); 232 schedule(); 235 schedule(); 233 finish_wait(wq, &ewait.wait); 236 finish_wait(wq, &ewait.wait); 234 xas_lock_irq(xas); 237 xas_lock_irq(xas); 235 } 238 } 236 } 239 } 237 240 238 /* 241 /* 239 * The only thing keeping the address space ar 242 * The only thing keeping the address space around is the i_pages lock 240 * (it's cycled in clear_inode() after removin 243 * (it's cycled in clear_inode() after removing the entries from i_pages) 241 * After we call xas_unlock_irq(), we cannot t 244 * After we call xas_unlock_irq(), we cannot touch xas->xa. 
242 */ 245 */ 243 static void wait_entry_unlocked(struct xa_stat 246 static void wait_entry_unlocked(struct xa_state *xas, void *entry) 244 { 247 { 245 struct wait_exceptional_entry_queue ew 248 struct wait_exceptional_entry_queue ewait; 246 wait_queue_head_t *wq; 249 wait_queue_head_t *wq; 247 250 248 init_wait(&ewait.wait); 251 init_wait(&ewait.wait); 249 ewait.wait.func = wake_exceptional_ent 252 ewait.wait.func = wake_exceptional_entry_func; 250 253 251 wq = dax_entry_waitqueue(xas, entry, & 254 wq = dax_entry_waitqueue(xas, entry, &ewait.key); 252 /* 255 /* 253 * Unlike get_unlocked_entry() there i 256 * Unlike get_unlocked_entry() there is no guarantee that this 254 * path ever successfully retrieves an 257 * path ever successfully retrieves an unlocked entry before an 255 * inode dies. Perform a non-exclusive 258 * inode dies. Perform a non-exclusive wait in case this path 256 * never successfully performs its own 259 * never successfully performs its own wake up. 257 */ 260 */ 258 prepare_to_wait(wq, &ewait.wait, TASK_ 261 prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); 259 xas_unlock_irq(xas); 262 xas_unlock_irq(xas); 260 schedule(); 263 schedule(); 261 finish_wait(wq, &ewait.wait); 264 finish_wait(wq, &ewait.wait); 262 } 265 } 263 266 264 static void put_unlocked_entry(struct xa_state !! 267 static void put_unlocked_entry(struct xa_state *xas, void *entry) 265 enum dax_wake_m << 266 { 268 { >> 269 /* If we were the only waiter woken, wake the next one */ 267 if (entry && !dax_is_conflict(entry)) 270 if (entry && !dax_is_conflict(entry)) 268 dax_wake_entry(xas, entry, mod !! 271 dax_wake_entry(xas, entry, false); 269 } 272 } 270 273 271 /* 274 /* 272 * We used the xa_state to get the entry, but 275 * We used the xa_state to get the entry, but then we locked the entry and 273 * dropped the xa_lock, so we know the xa_stat 276 * dropped the xa_lock, so we know the xa_state is stale and must be reset 274 * before use. 277 * before use. 275 */ 278 */ 276 static void dax_unlock_entry(struct xa_state * 279 static void dax_unlock_entry(struct xa_state *xas, void *entry) 277 { 280 { 278 void *old; 281 void *old; 279 282 280 BUG_ON(dax_is_locked(entry)); 283 BUG_ON(dax_is_locked(entry)); 281 xas_reset(xas); 284 xas_reset(xas); 282 xas_lock_irq(xas); 285 xas_lock_irq(xas); 283 old = xas_store(xas, entry); 286 old = xas_store(xas, entry); 284 xas_unlock_irq(xas); 287 xas_unlock_irq(xas); 285 BUG_ON(!dax_is_locked(old)); 288 BUG_ON(!dax_is_locked(old)); 286 dax_wake_entry(xas, entry, WAKE_NEXT); !! 289 dax_wake_entry(xas, entry, false); 287 } 290 } 288 291 289 /* 292 /* 290 * Return: The entry stored at this location b 293 * Return: The entry stored at this location before it was locked. 
291 */ 294 */ 292 static void *dax_lock_entry(struct xa_state *x 295 static void *dax_lock_entry(struct xa_state *xas, void *entry) 293 { 296 { 294 unsigned long v = xa_to_value(entry); 297 unsigned long v = xa_to_value(entry); 295 return xas_store(xas, xa_mk_value(v | 298 return xas_store(xas, xa_mk_value(v | DAX_LOCKED)); 296 } 299 } 297 300 298 static unsigned long dax_entry_size(void *entr 301 static unsigned long dax_entry_size(void *entry) 299 { 302 { 300 if (dax_is_zero_entry(entry)) 303 if (dax_is_zero_entry(entry)) 301 return 0; 304 return 0; 302 else if (dax_is_empty_entry(entry)) 305 else if (dax_is_empty_entry(entry)) 303 return 0; 306 return 0; 304 else if (dax_is_pmd_entry(entry)) 307 else if (dax_is_pmd_entry(entry)) 305 return PMD_SIZE; 308 return PMD_SIZE; 306 else 309 else 307 return PAGE_SIZE; 310 return PAGE_SIZE; 308 } 311 } 309 312 310 static unsigned long dax_end_pfn(void *entry) 313 static unsigned long dax_end_pfn(void *entry) 311 { 314 { 312 return dax_to_pfn(entry) + dax_entry_s 315 return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; 313 } 316 } 314 317 315 /* 318 /* 316 * Iterate through all mapped pfns represented 319 * Iterate through all mapped pfns represented by an entry, i.e. skip 317 * 'empty' and 'zero' entries. 320 * 'empty' and 'zero' entries. 318 */ 321 */ 319 #define for_each_mapped_pfn(entry, pfn) \ 322 #define for_each_mapped_pfn(entry, pfn) \ 320 for (pfn = dax_to_pfn(entry); \ 323 for (pfn = dax_to_pfn(entry); \ 321 pfn < dax_end_pfn(entr 324 pfn < dax_end_pfn(entry); pfn++) 322 325 323 static inline bool dax_page_is_shared(struct p << 324 { << 325 return page->mapping == PAGE_MAPPING_D << 326 } << 327 << 328 /* << 329 * Set the page->mapping with PAGE_MAPPING_DAX << 330 * refcount. << 331 */ << 332 static inline void dax_page_share_get(struct p << 333 { << 334 if (page->mapping != PAGE_MAPPING_DAX_ << 335 /* << 336 * Reset the index if the page << 337 * regularly before. << 338 */ << 339 if (page->mapping) << 340 page->share = 1; << 341 page->mapping = PAGE_MAPPING_D << 342 } << 343 page->share++; << 344 } << 345 << 346 static inline unsigned long dax_page_share_put << 347 { << 348 return --page->share; << 349 } << 350 << 351 /* 326 /* 352 * When it is called in dax_insert_entry(), th !! 327 * TODO: for reflink+dax we need a way to associate a single page with 353 * whether this entry is shared by multiple fi !! 328 * multiple address_space instances at different linear_page_index() 354 * PAGE_MAPPING_DAX_SHARED, and use page->shar !! 329 * offsets. 355 */ 330 */ 356 static void dax_associate_entry(void *entry, s 331 static void dax_associate_entry(void *entry, struct address_space *mapping, 357 struct vm_area_struct *vma, un !! 332 struct vm_area_struct *vma, unsigned long address) 358 { 333 { 359 unsigned long size = dax_entry_size(en 334 unsigned long size = dax_entry_size(entry), pfn, index; 360 int i = 0; 335 int i = 0; 361 336 362 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) 337 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) 363 return; 338 return; 364 339 365 index = linear_page_index(vma, address 340 index = linear_page_index(vma, address & ~(size - 1)); 366 for_each_mapped_pfn(entry, pfn) { 341 for_each_mapped_pfn(entry, pfn) { 367 struct page *page = pfn_to_pag 342 struct page *page = pfn_to_page(pfn); 368 343 369 if (shared) { !! 344 WARN_ON_ONCE(page->mapping); 370 dax_page_share_get(pag !! 345 page->mapping = mapping; 371 } else { !! 
346 page->index = index + i++; 372 WARN_ON_ONCE(page->map << 373 page->mapping = mappin << 374 page->index = index + << 375 } << 376 } 347 } 377 } 348 } 378 349 379 static void dax_disassociate_entry(void *entry 350 static void dax_disassociate_entry(void *entry, struct address_space *mapping, 380 bool trunc) 351 bool trunc) 381 { 352 { 382 unsigned long pfn; 353 unsigned long pfn; 383 354 384 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) 355 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) 385 return; 356 return; 386 357 387 for_each_mapped_pfn(entry, pfn) { 358 for_each_mapped_pfn(entry, pfn) { 388 struct page *page = pfn_to_pag 359 struct page *page = pfn_to_page(pfn); 389 360 390 WARN_ON_ONCE(trunc && page_ref 361 WARN_ON_ONCE(trunc && page_ref_count(page) > 1); 391 if (dax_page_is_shared(page)) !! 362 WARN_ON_ONCE(page->mapping && page->mapping != mapping); 392 /* keep the shared fla << 393 if (dax_page_share_put << 394 continue; << 395 } else << 396 WARN_ON_ONCE(page->map << 397 page->mapping = NULL; 363 page->mapping = NULL; 398 page->index = 0; 364 page->index = 0; 399 } 365 } 400 } 366 } 401 367 402 static struct page *dax_busy_page(void *entry) 368 static struct page *dax_busy_page(void *entry) 403 { 369 { 404 unsigned long pfn; 370 unsigned long pfn; 405 371 406 for_each_mapped_pfn(entry, pfn) { 372 for_each_mapped_pfn(entry, pfn) { 407 struct page *page = pfn_to_pag 373 struct page *page = pfn_to_page(pfn); 408 374 409 if (page_ref_count(page) > 1) 375 if (page_ref_count(page) > 1) 410 return page; 376 return page; 411 } 377 } 412 return NULL; 378 return NULL; 413 } 379 } 414 380 415 /** !! 381 /* 416 * dax_lock_folio - Lock the DAX entry corresp !! 382 * dax_lock_mapping_entry - Lock the DAX entry corresponding to a page 417 * @folio: The folio whose entry we want to lo !! 383 * @page: The page whose entry we want to lock 418 * 384 * 419 * Context: Process context. 385 * Context: Process context. 420 * Return: A cookie to pass to dax_unlock_foli !! 386 * Return: A cookie to pass to dax_unlock_page() or 0 if the entry could 421 * not be locked. 387 * not be locked. 422 */ 388 */ 423 dax_entry_t dax_lock_folio(struct folio *folio !! 389 dax_entry_t dax_lock_page(struct page *page) 424 { 390 { 425 XA_STATE(xas, NULL, 0); 391 XA_STATE(xas, NULL, 0); 426 void *entry; 392 void *entry; 427 393 428 /* Ensure folio->mapping isn't freed w !! 394 /* Ensure page->mapping isn't freed while we look at it */ 429 rcu_read_lock(); 395 rcu_read_lock(); 430 for (;;) { 396 for (;;) { 431 struct address_space *mapping !! 397 struct address_space *mapping = READ_ONCE(page->mapping); 432 398 433 entry = NULL; 399 entry = NULL; 434 if (!mapping || !dax_mapping(m 400 if (!mapping || !dax_mapping(mapping)) 435 break; 401 break; 436 402 437 /* 403 /* 438 * In the device-dax case ther 404 * In the device-dax case there's no need to lock, a 439 * struct dev_pagemap pin is s 405 * struct dev_pagemap pin is sufficient to keep the 440 * inode alive, and we assume 406 * inode alive, and we assume we have dev_pagemap pin 441 * otherwise we would not have 407 * otherwise we would not have a valid pfn_to_page() 442 * translation. 408 * translation. 443 */ 409 */ 444 entry = (void *)~0UL; 410 entry = (void *)~0UL; 445 if (S_ISCHR(mapping->host->i_m 411 if (S_ISCHR(mapping->host->i_mode)) 446 break; 412 break; 447 413 448 xas.xa = &mapping->i_pages; 414 xas.xa = &mapping->i_pages; 449 xas_lock_irq(&xas); 415 xas_lock_irq(&xas); 450 if (mapping != folio->mapping) !! 
416 if (mapping != page->mapping) { 451 xas_unlock_irq(&xas); 417 xas_unlock_irq(&xas); 452 continue; 418 continue; 453 } 419 } 454 xas_set(&xas, folio->index); !! 420 xas_set(&xas, page->index); 455 entry = xas_load(&xas); 421 entry = xas_load(&xas); 456 if (dax_is_locked(entry)) { 422 if (dax_is_locked(entry)) { 457 rcu_read_unlock(); 423 rcu_read_unlock(); 458 wait_entry_unlocked(&x 424 wait_entry_unlocked(&xas, entry); 459 rcu_read_lock(); 425 rcu_read_lock(); 460 continue; 426 continue; 461 } 427 } 462 dax_lock_entry(&xas, entry); 428 dax_lock_entry(&xas, entry); 463 xas_unlock_irq(&xas); 429 xas_unlock_irq(&xas); 464 break; 430 break; 465 } 431 } 466 rcu_read_unlock(); 432 rcu_read_unlock(); 467 return (dax_entry_t)entry; 433 return (dax_entry_t)entry; 468 } 434 } 469 435 470 void dax_unlock_folio(struct folio *folio, dax !! 436 void dax_unlock_page(struct page *page, dax_entry_t cookie) 471 { 437 { 472 struct address_space *mapping = folio- !! 438 struct address_space *mapping = page->mapping; 473 XA_STATE(xas, &mapping->i_pages, folio !! 439 XA_STATE(xas, &mapping->i_pages, page->index); 474 440 475 if (S_ISCHR(mapping->host->i_mode)) 441 if (S_ISCHR(mapping->host->i_mode)) 476 return; 442 return; 477 443 478 dax_unlock_entry(&xas, (void *)cookie) 444 dax_unlock_entry(&xas, (void *)cookie); 479 } 445 } 480 446 481 /* 447 /* 482 * dax_lock_mapping_entry - Lock the DAX entry << 483 * @mapping: the file's mapping whose entry we << 484 * @index: the offset within this file << 485 * @page: output the dax page corresponding to << 486 * << 487 * Return: A cookie to pass to dax_unlock_mapp << 488 * could not be locked. << 489 */ << 490 dax_entry_t dax_lock_mapping_entry(struct addr << 491 struct page **page) << 492 { << 493 XA_STATE(xas, NULL, 0); << 494 void *entry; << 495 << 496 rcu_read_lock(); << 497 for (;;) { << 498 entry = NULL; << 499 if (!dax_mapping(mapping)) << 500 break; << 501 << 502 xas.xa = &mapping->i_pages; << 503 xas_lock_irq(&xas); << 504 xas_set(&xas, index); << 505 entry = xas_load(&xas); << 506 if (dax_is_locked(entry)) { << 507 rcu_read_unlock(); << 508 wait_entry_unlocked(&x << 509 rcu_read_lock(); << 510 continue; << 511 } << 512 if (!entry || << 513 dax_is_zero_entry(entry) | << 514 /* << 515 * Because we are look << 516 * and index, so the e << 517 * or even a zero/empt << 518 * an error case. So, << 519 * not output @page. << 520 */ << 521 entry = (void *)~0UL; << 522 } else { << 523 *page = pfn_to_page(da << 524 dax_lock_entry(&xas, e << 525 } << 526 xas_unlock_irq(&xas); << 527 break; << 528 } << 529 rcu_read_unlock(); << 530 return (dax_entry_t)entry; << 531 } << 532 << 533 void dax_unlock_mapping_entry(struct address_s << 534 dax_entry_t cookie) << 535 { << 536 XA_STATE(xas, &mapping->i_pages, index << 537 << 538 if (cookie == ~0UL) << 539 return; << 540 << 541 dax_unlock_entry(&xas, (void *)cookie) << 542 } << 543 << 544 /* << 545 * Find page cache entry at given index. If it 448 * Find page cache entry at given index. If it is a DAX entry, return it 546 * with the entry locked. If the page cache do 449 * with the entry locked. If the page cache doesn't contain an entry at 547 * that index, add a locked empty entry. 450 * that index, add a locked empty entry. 548 * 451 * 549 * When requesting an entry with size DAX_PMD, 452 * When requesting an entry with size DAX_PMD, grab_mapping_entry() will 550 * either return that locked entry or will ret 453 * either return that locked entry or will return VM_FAULT_FALLBACK. 
551 * This will happen if there are any PTE entri 454 * This will happen if there are any PTE entries within the PMD range 552 * that we are requesting. 455 * that we are requesting. 553 * 456 * 554 * We always favor PTE entries over PMD entrie 457 * We always favor PTE entries over PMD entries. There isn't a flow where we 555 * evict PTE entries in order to 'upgrade' the 458 * evict PTE entries in order to 'upgrade' them to a PMD entry. A PMD 556 * insertion will fail if it finds any PTE ent 459 * insertion will fail if it finds any PTE entries already in the tree, and a 557 * PTE insertion will cause an existing PMD en 460 * PTE insertion will cause an existing PMD entry to be unmapped and 558 * downgraded to PTE entries. This happens fo 461 * downgraded to PTE entries. This happens for both PMD zero pages as 559 * well as PMD empty entries. 462 * well as PMD empty entries. 560 * 463 * 561 * The exception to this downgrade path is for 464 * The exception to this downgrade path is for PMD entries that have 562 * real storage backing them. We will leave t 465 * real storage backing them. We will leave these real PMD entries in 563 * the tree, and PTE writes will simply dirty 466 * the tree, and PTE writes will simply dirty the entire PMD entry. 564 * 467 * 565 * Note: Unlike filemap_fault() we don't honor 468 * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For 566 * persistent memory the benefit is doubtful. 469 * persistent memory the benefit is doubtful. We can add that later if we can 567 * show it helps. 470 * show it helps. 568 * 471 * 569 * On error, this function does not return an 472 * On error, this function does not return an ERR_PTR. Instead it returns 570 * a VM_FAULT code, encoded as an xarray inter 473 * a VM_FAULT code, encoded as an xarray internal entry. The ERR_PTR values 571 * overlap with xarray value entries. 474 * overlap with xarray value entries. 572 */ 475 */ 573 static void *grab_mapping_entry(struct xa_stat 476 static void *grab_mapping_entry(struct xa_state *xas, 574 struct address_space *mapping, 477 struct address_space *mapping, unsigned int order) 575 { 478 { 576 unsigned long index = xas->xa_index; 479 unsigned long index = xas->xa_index; 577 bool pmd_downgrade; /* splitting P !! 480 bool pmd_downgrade = false; /* splitting PMD entry into PTE entries? */ 578 void *entry; 481 void *entry; 579 482 580 retry: 483 retry: 581 pmd_downgrade = false; << 582 xas_lock_irq(xas); 484 xas_lock_irq(xas); 583 entry = get_unlocked_entry(xas, order) 485 entry = get_unlocked_entry(xas, order); 584 486 585 if (entry) { 487 if (entry) { 586 if (dax_is_conflict(entry)) 488 if (dax_is_conflict(entry)) 587 goto fallback; 489 goto fallback; 588 if (!xa_is_value(entry)) { 490 if (!xa_is_value(entry)) { 589 xas_set_err(xas, -EIO) !! 491 xas_set_err(xas, EIO); 590 goto out_unlock; 492 goto out_unlock; 591 } 493 } 592 494 593 if (order == 0) { 495 if (order == 0) { 594 if (dax_is_pmd_entry(e 496 if (dax_is_pmd_entry(entry) && 595 (dax_is_zero_entry 497 (dax_is_zero_entry(entry) || 596 dax_is_empty_entr 498 dax_is_empty_entry(entry))) { 597 pmd_downgrade 499 pmd_downgrade = true; 598 } 500 } 599 } 501 } 600 } 502 } 601 503 602 if (pmd_downgrade) { 504 if (pmd_downgrade) { 603 /* 505 /* 604 * Make sure 'entry' remains v 506 * Make sure 'entry' remains valid while we drop 605 * the i_pages lock. 507 * the i_pages lock. 
606 */ 508 */ 607 dax_lock_entry(xas, entry); 509 dax_lock_entry(xas, entry); 608 510 609 /* 511 /* 610 * Besides huge zero pages the 512 * Besides huge zero pages the only other thing that gets 611 * downgraded are empty entrie 513 * downgraded are empty entries which don't need to be 612 * unmapped. 514 * unmapped. 613 */ 515 */ 614 if (dax_is_zero_entry(entry)) 516 if (dax_is_zero_entry(entry)) { 615 xas_unlock_irq(xas); 517 xas_unlock_irq(xas); 616 unmap_mapping_pages(ma 518 unmap_mapping_pages(mapping, 617 xas->x 519 xas->xa_index & ~PG_PMD_COLOUR, 618 PG_PMD 520 PG_PMD_NR, false); 619 xas_reset(xas); 521 xas_reset(xas); 620 xas_lock_irq(xas); 522 xas_lock_irq(xas); 621 } 523 } 622 524 623 dax_disassociate_entry(entry, 525 dax_disassociate_entry(entry, mapping, false); 624 xas_store(xas, NULL); /* und 526 xas_store(xas, NULL); /* undo the PMD join */ 625 dax_wake_entry(xas, entry, WAK !! 527 dax_wake_entry(xas, entry, true); 626 mapping->nrpages -= PG_PMD_NR; !! 528 mapping->nrexceptional--; 627 entry = NULL; 529 entry = NULL; 628 xas_set(xas, index); 530 xas_set(xas, index); 629 } 531 } 630 532 631 if (entry) { 533 if (entry) { 632 dax_lock_entry(xas, entry); 534 dax_lock_entry(xas, entry); 633 } else { 535 } else { 634 unsigned long flags = DAX_EMPT 536 unsigned long flags = DAX_EMPTY; 635 537 636 if (order > 0) 538 if (order > 0) 637 flags |= DAX_PMD; 539 flags |= DAX_PMD; 638 entry = dax_make_entry(pfn_to_ 540 entry = dax_make_entry(pfn_to_pfn_t(0), flags); 639 dax_lock_entry(xas, entry); 541 dax_lock_entry(xas, entry); 640 if (xas_error(xas)) 542 if (xas_error(xas)) 641 goto out_unlock; 543 goto out_unlock; 642 mapping->nrpages += 1UL << ord !! 544 mapping->nrexceptional++; 643 } 545 } 644 546 645 out_unlock: 547 out_unlock: 646 xas_unlock_irq(xas); 548 xas_unlock_irq(xas); 647 if (xas_nomem(xas, mapping_gfp_mask(ma 549 if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM)) 648 goto retry; 550 goto retry; 649 if (xas->xa_node == XA_ERROR(-ENOMEM)) 551 if (xas->xa_node == XA_ERROR(-ENOMEM)) 650 return xa_mk_internal(VM_FAULT 552 return xa_mk_internal(VM_FAULT_OOM); 651 if (xas_error(xas)) 553 if (xas_error(xas)) 652 return xa_mk_internal(VM_FAULT 554 return xa_mk_internal(VM_FAULT_SIGBUS); 653 return entry; 555 return entry; 654 fallback: 556 fallback: 655 xas_unlock_irq(xas); 557 xas_unlock_irq(xas); 656 return xa_mk_internal(VM_FAULT_FALLBAC 558 return xa_mk_internal(VM_FAULT_FALLBACK); 657 } 559 } 658 560 659 /** 561 /** 660 * dax_layout_busy_page_range - find first pin !! 562 * dax_layout_busy_page - find first pinned page in @mapping 661 * @mapping: address space to scan for a page 563 * @mapping: address space to scan for a page with ref count > 1 662 * @start: Starting offset. Page containing 's << 663 * @end: End offset. Page containing 'end' is << 664 * pages from 'start' till the end of fi << 665 * 564 * 666 * DAX requires ZONE_DEVICE mapped pages. Thes 565 * DAX requires ZONE_DEVICE mapped pages. These pages are never 667 * 'onlined' to the page allocator so they are 566 * 'onlined' to the page allocator so they are considered idle when 668 * page->count == 1. A filesystem uses this in 567 * page->count == 1. A filesystem uses this interface to determine if 669 * any page in the mapping is busy, i.e. for D 568 * any page in the mapping is busy, i.e. for DMA, or other 670 * get_user_pages() usages. 569 * get_user_pages() usages. 
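 *
 * For example (a rough sketch, not taken from any in-tree filesystem):
 * a caller that has already unmapped the range and blocked new page
 * faults could drain busy pages before freeing blocks roughly like
 * this, with error handling and the actual wait-for-idle elided:
 *
 *	struct page *page;
 *
 *	while ((page = dax_layout_busy_page(mapping)) != NULL) {
 *		if (fatal_signal_pending(current))
 *			return -EINTR;
 *		wait until page_ref_count(page) drops back to 1;
 *	}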
671 * 570 * 672 * It is expected that the filesystem is holdi 571 * It is expected that the filesystem is holding locks to block the 673 * establishment of new mappings in this addre 572 * establishment of new mappings in this address_space. I.e. it expects 674 * to be able to run unmap_mapping_range() and 573 * to be able to run unmap_mapping_range() and subsequently not race 675 * mapping_mapped() becoming true. 574 * mapping_mapped() becoming true. 676 */ 575 */ 677 struct page *dax_layout_busy_page_range(struct !! 576 struct page *dax_layout_busy_page(struct address_space *mapping) 678 loff_t << 679 { 577 { >> 578 XA_STATE(xas, &mapping->i_pages, 0); 680 void *entry; 579 void *entry; 681 unsigned int scanned = 0; 580 unsigned int scanned = 0; 682 struct page *page = NULL; 581 struct page *page = NULL; 683 pgoff_t start_idx = start >> PAGE_SHIF << 684 pgoff_t end_idx; << 685 XA_STATE(xas, &mapping->i_pages, start << 686 582 687 /* 583 /* 688 * In the 'limited' case get_user_page 584 * In the 'limited' case get_user_pages() for dax is disabled. 689 */ 585 */ 690 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) 586 if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) 691 return NULL; 587 return NULL; 692 588 693 if (!dax_mapping(mapping) || !mapping_ 589 if (!dax_mapping(mapping) || !mapping_mapped(mapping)) 694 return NULL; 590 return NULL; 695 591 696 /* If end == LLONG_MAX, all pages from << 697 if (end == LLONG_MAX) << 698 end_idx = ULONG_MAX; << 699 else << 700 end_idx = end >> PAGE_SHIFT; << 701 /* 592 /* 702 * If we race get_user_pages_fast() he 593 * If we race get_user_pages_fast() here either we'll see the 703 * elevated page count in the iteratio 594 * elevated page count in the iteration and wait, or 704 * get_user_pages_fast() will see that 595 * get_user_pages_fast() will see that the page it took a reference 705 * against is no longer mapped in the 596 * against is no longer mapped in the page tables and bail to the 706 * get_user_pages() slow path. The sl 597 * get_user_pages() slow path. The slow path is protected by 707 * pte_lock() and pmd_lock(). New refe 598 * pte_lock() and pmd_lock(). New references are not taken without 708 * holding those locks, and unmap_mapp !! 599 * holding those locks, and unmap_mapping_range() will not zero the 709 * pte or pmd without holding the resp 600 * pte or pmd without holding the respective lock, so we are 710 * guaranteed to either see new refere 601 * guaranteed to either see new references or prevent new 711 * references from being established. 602 * references from being established. 712 */ 603 */ 713 unmap_mapping_pages(mapping, start_idx !! 604 unmap_mapping_range(mapping, 0, 0, 0); 714 605 715 xas_lock_irq(&xas); 606 xas_lock_irq(&xas); 716 xas_for_each(&xas, entry, end_idx) { !! 607 xas_for_each(&xas, entry, ULONG_MAX) { 717 if (WARN_ON_ONCE(!xa_is_value( 608 if (WARN_ON_ONCE(!xa_is_value(entry))) 718 continue; 609 continue; 719 if (unlikely(dax_is_locked(ent 610 if (unlikely(dax_is_locked(entry))) 720 entry = get_unlocked_e 611 entry = get_unlocked_entry(&xas, 0); 721 if (entry) 612 if (entry) 722 page = dax_busy_page(e 613 page = dax_busy_page(entry); 723 put_unlocked_entry(&xas, entry !! 
614 put_unlocked_entry(&xas, entry); 724 if (page) 615 if (page) 725 break; 616 break; 726 if (++scanned % XA_CHECK_SCHED 617 if (++scanned % XA_CHECK_SCHED) 727 continue; 618 continue; 728 619 729 xas_pause(&xas); 620 xas_pause(&xas); 730 xas_unlock_irq(&xas); 621 xas_unlock_irq(&xas); 731 cond_resched(); 622 cond_resched(); 732 xas_lock_irq(&xas); 623 xas_lock_irq(&xas); 733 } 624 } 734 xas_unlock_irq(&xas); 625 xas_unlock_irq(&xas); 735 return page; 626 return page; 736 } 627 } 737 EXPORT_SYMBOL_GPL(dax_layout_busy_page_range); << 738 << 739 struct page *dax_layout_busy_page(struct addre << 740 { << 741 return dax_layout_busy_page_range(mapp << 742 } << 743 EXPORT_SYMBOL_GPL(dax_layout_busy_page); 628 EXPORT_SYMBOL_GPL(dax_layout_busy_page); 744 629 745 static int __dax_invalidate_entry(struct addre 630 static int __dax_invalidate_entry(struct address_space *mapping, 746 pgof 631 pgoff_t index, bool trunc) 747 { 632 { 748 XA_STATE(xas, &mapping->i_pages, index 633 XA_STATE(xas, &mapping->i_pages, index); 749 int ret = 0; 634 int ret = 0; 750 void *entry; 635 void *entry; 751 636 752 xas_lock_irq(&xas); 637 xas_lock_irq(&xas); 753 entry = get_unlocked_entry(&xas, 0); 638 entry = get_unlocked_entry(&xas, 0); 754 if (!entry || WARN_ON_ONCE(!xa_is_valu 639 if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) 755 goto out; 640 goto out; 756 if (!trunc && 641 if (!trunc && 757 (xas_get_mark(&xas, PAGECACHE_TAG_ 642 (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) || 758 xas_get_mark(&xas, PAGECACHE_TAG_ 643 xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE))) 759 goto out; 644 goto out; 760 dax_disassociate_entry(entry, mapping, 645 dax_disassociate_entry(entry, mapping, trunc); 761 xas_store(&xas, NULL); 646 xas_store(&xas, NULL); 762 mapping->nrpages -= 1UL << dax_entry_o !! 647 mapping->nrexceptional--; 763 ret = 1; 648 ret = 1; 764 out: 649 out: 765 put_unlocked_entry(&xas, entry, WAKE_A !! 650 put_unlocked_entry(&xas, entry); 766 xas_unlock_irq(&xas); 651 xas_unlock_irq(&xas); 767 return ret; 652 return ret; 768 } 653 } 769 654 770 static int __dax_clear_dirty_range(struct addr << 771 pgoff_t start, pgoff_t end) << 772 { << 773 XA_STATE(xas, &mapping->i_pages, start << 774 unsigned int scanned = 0; << 775 void *entry; << 776 << 777 xas_lock_irq(&xas); << 778 xas_for_each(&xas, entry, end) { << 779 entry = get_unlocked_entry(&xa << 780 xas_clear_mark(&xas, PAGECACHE << 781 xas_clear_mark(&xas, PAGECACHE << 782 put_unlocked_entry(&xas, entry << 783 << 784 if (++scanned % XA_CHECK_SCHED << 785 continue; << 786 << 787 xas_pause(&xas); << 788 xas_unlock_irq(&xas); << 789 cond_resched(); << 790 xas_lock_irq(&xas); << 791 } << 792 xas_unlock_irq(&xas); << 793 << 794 return 0; << 795 } << 796 << 797 /* 655 /* 798 * Delete DAX entry at @index from @mapping. 656 * Delete DAX entry at @index from @mapping. Wait for it 799 * to be unlocked before deleting it. 657 * to be unlocked before deleting it. 800 */ 658 */ 801 int dax_delete_mapping_entry(struct address_sp 659 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) 802 { 660 { 803 int ret = __dax_invalidate_entry(mappi 661 int ret = __dax_invalidate_entry(mapping, index, true); 804 662 805 /* 663 /* 806 * This gets called from truncate / pu 664 * This gets called from truncate / punch_hole path. As such, the caller 807 * must hold locks protecting against 665 * must hold locks protecting against concurrent modifications of the 808 * page cache (usually fs-private i_mm 666 * page cache (usually fs-private i_mmap_sem for writing). 
Since the 809 * caller has seen a DAX entry for thi 667 * caller has seen a DAX entry for this index, we better find it 810 * at that index as well... 668 * at that index as well... 811 */ 669 */ 812 WARN_ON_ONCE(!ret); 670 WARN_ON_ONCE(!ret); 813 return ret; 671 return ret; 814 } 672 } 815 673 816 /* 674 /* 817 * Invalidate DAX entry if it is clean. 675 * Invalidate DAX entry if it is clean. 818 */ 676 */ 819 int dax_invalidate_mapping_entry_sync(struct a 677 int dax_invalidate_mapping_entry_sync(struct address_space *mapping, 820 pgoff_t 678 pgoff_t index) 821 { 679 { 822 return __dax_invalidate_entry(mapping, 680 return __dax_invalidate_entry(mapping, index, false); 823 } 681 } 824 682 825 static pgoff_t dax_iomap_pgoff(const struct io !! 683 static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev, 826 { !! 684 sector_t sector, size_t size, struct page *to, 827 return PHYS_PFN(iomap->addr + (pos & P !! 685 unsigned long vaddr) 828 } << 829 << 830 static int copy_cow_page_dax(struct vm_fault * << 831 { 686 { 832 pgoff_t pgoff = dax_iomap_pgoff(&iter- << 833 void *vto, *kaddr; 687 void *vto, *kaddr; >> 688 pgoff_t pgoff; 834 long rc; 689 long rc; 835 int id; 690 int id; 836 691 >> 692 rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); >> 693 if (rc) >> 694 return rc; >> 695 837 id = dax_read_lock(); 696 id = dax_read_lock(); 838 rc = dax_direct_access(iter->iomap.dax !! 697 rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, NULL); 839 &kaddr, NULL); << 840 if (rc < 0) { 698 if (rc < 0) { 841 dax_read_unlock(id); 699 dax_read_unlock(id); 842 return rc; 700 return rc; 843 } 701 } 844 vto = kmap_atomic(vmf->cow_page); !! 702 vto = kmap_atomic(to); 845 copy_user_page(vto, kaddr, vmf->addres !! 703 copy_user_page(vto, (void __force *)kaddr, vaddr, to); 846 kunmap_atomic(vto); 704 kunmap_atomic(vto); 847 dax_read_unlock(id); 705 dax_read_unlock(id); 848 return 0; 706 return 0; 849 } 707 } 850 708 851 /* 709 /* 852 * MAP_SYNC on a dax mapping guarantees dirty << 853 * flushed on write-faults (non-cow), but not << 854 */ << 855 static bool dax_fault_is_synchronous(const str << 856 struct vm_area_struct *vma) << 857 { << 858 return (iter->flags & IOMAP_WRITE) && << 859 (iter->iomap.flags & IOMAP_F_D << 860 } << 861 << 862 /* << 863 * By this point grab_mapping_entry() has ensu 710 * By this point grab_mapping_entry() has ensured that we have a locked entry 864 * of the appropriate size so we don't have to 711 * of the appropriate size so we don't have to worry about downgrading PMDs to 865 * PTEs. If we happen to be trying to insert 712 * PTEs. If we happen to be trying to insert a PTE and there is a PMD 866 * already in the tree, we will skip the inser 713 * already in the tree, we will skip the insertion and just dirty the PMD as 867 * appropriate. 714 * appropriate. 868 */ 715 */ 869 static void *dax_insert_entry(struct xa_state !! 716 static void *dax_insert_entry(struct xa_state *xas, 870 const struct iomap_iter *iter, !! 717 struct address_space *mapping, struct vm_fault *vmf, 871 unsigned long flags) !! 
718 void *entry, pfn_t pfn, unsigned long flags, bool dirty) 872 { 719 { 873 struct address_space *mapping = vmf->v << 874 void *new_entry = dax_make_entry(pfn, 720 void *new_entry = dax_make_entry(pfn, flags); 875 bool write = iter->flags & IOMAP_WRITE << 876 bool dirty = write && !dax_fault_is_sy << 877 bool shared = iter->iomap.flags & IOMA << 878 721 879 if (dirty) 722 if (dirty) 880 __mark_inode_dirty(mapping->ho 723 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); 881 724 882 if (shared || (dax_is_zero_entry(entry !! 725 if (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE)) { 883 unsigned long index = xas->xa_ 726 unsigned long index = xas->xa_index; 884 /* we are replacing a zero pag 727 /* we are replacing a zero page with block mapping */ 885 if (dax_is_pmd_entry(entry)) 728 if (dax_is_pmd_entry(entry)) 886 unmap_mapping_pages(ma 729 unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, 887 PG_PMD 730 PG_PMD_NR, false); 888 else /* pte entry */ 731 else /* pte entry */ 889 unmap_mapping_pages(ma 732 unmap_mapping_pages(mapping, index, 1, false); 890 } 733 } 891 734 892 xas_reset(xas); 735 xas_reset(xas); 893 xas_lock_irq(xas); 736 xas_lock_irq(xas); 894 if (shared || dax_is_zero_entry(entry) !! 737 if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { 895 void *old; 738 void *old; 896 739 897 dax_disassociate_entry(entry, 740 dax_disassociate_entry(entry, mapping, false); 898 dax_associate_entry(new_entry, !! 741 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address); 899 shared); << 900 /* 742 /* 901 * Only swap our new entry int 743 * Only swap our new entry into the page cache if the current 902 * entry is a zero page or an 744 * entry is a zero page or an empty entry. If a normal PTE or 903 * PMD entry is already in the 745 * PMD entry is already in the cache, we leave it alone. This 904 * means that if we are trying 746 * means that if we are trying to insert a PTE and the 905 * existing entry is a PMD, we 747 * existing entry is a PMD, we will just leave the PMD in the 906 * tree and dirty it if necess 748 * tree and dirty it if necessary. 
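		 * (Dirtying an already-present PMD entry only sets
		 * PAGECACHE_TAG_DIRTY on that PMD-sized entry, so a later
		 * fsync/msync writes back and flushes the full PMD range
		 * rather than a single page.)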
907 */ 749 */ 908 old = dax_lock_entry(xas, new_ 750 old = dax_lock_entry(xas, new_entry); 909 WARN_ON_ONCE(old != xa_mk_valu 751 WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) | 910 DAX_LO 752 DAX_LOCKED)); 911 entry = new_entry; 753 entry = new_entry; 912 } else { 754 } else { 913 xas_load(xas); /* Walk the xa 755 xas_load(xas); /* Walk the xa_state */ 914 } 756 } 915 757 916 if (dirty) 758 if (dirty) 917 xas_set_mark(xas, PAGECACHE_TA 759 xas_set_mark(xas, PAGECACHE_TAG_DIRTY); 918 760 919 if (write && shared) << 920 xas_set_mark(xas, PAGECACHE_TA << 921 << 922 xas_unlock_irq(xas); 761 xas_unlock_irq(xas); 923 return entry; 762 return entry; 924 } 763 } 925 764 >> 765 static inline >> 766 unsigned long pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) >> 767 { >> 768 unsigned long address; >> 769 >> 770 address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); >> 771 VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); >> 772 return address; >> 773 } >> 774 >> 775 /* Walk all mappings of a given index of a file and writeprotect them */ >> 776 static void dax_entry_mkclean(struct address_space *mapping, pgoff_t index, >> 777 unsigned long pfn) >> 778 { >> 779 struct vm_area_struct *vma; >> 780 pte_t pte, *ptep = NULL; >> 781 pmd_t *pmdp = NULL; >> 782 spinlock_t *ptl; >> 783 >> 784 i_mmap_lock_read(mapping); >> 785 vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { >> 786 struct mmu_notifier_range range; >> 787 unsigned long address; >> 788 >> 789 cond_resched(); >> 790 >> 791 if (!(vma->vm_flags & VM_SHARED)) >> 792 continue; >> 793 >> 794 address = pgoff_address(index, vma); >> 795 >> 796 /* >> 797 * Note because we provide range to follow_pte_pmd it will >> 798 * call mmu_notifier_invalidate_range_start() on our behalf >> 799 * before taking any lock. >> 800 */ >> 801 if (follow_pte_pmd(vma->vm_mm, address, &range, >> 802 &ptep, &pmdp, &ptl)) >> 803 continue; >> 804 >> 805 /* >> 806 * No need to call mmu_notifier_invalidate_range() as we are >> 807 * downgrading page table protection not changing it to point >> 808 * to a new page. >> 809 * >> 810 * See Documentation/vm/mmu_notifier.rst >> 811 */ >> 812 if (pmdp) { >> 813 #ifdef CONFIG_FS_DAX_PMD >> 814 pmd_t pmd; >> 815 >> 816 if (pfn != pmd_pfn(*pmdp)) >> 817 goto unlock_pmd; >> 818 if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) >> 819 goto unlock_pmd; >> 820 >> 821 flush_cache_page(vma, address, pfn); >> 822 pmd = pmdp_invalidate(vma, address, pmdp); >> 823 pmd = pmd_wrprotect(pmd); >> 824 pmd = pmd_mkclean(pmd); >> 825 set_pmd_at(vma->vm_mm, address, pmdp, pmd); >> 826 unlock_pmd: >> 827 #endif >> 828 spin_unlock(ptl); >> 829 } else { >> 830 if (pfn != pte_pfn(*ptep)) >> 831 goto unlock_pte; >> 832 if (!pte_dirty(*ptep) && !pte_write(*ptep)) >> 833 goto unlock_pte; >> 834 >> 835 flush_cache_page(vma, address, pfn); >> 836 pte = ptep_clear_flush(vma, address, ptep); >> 837 pte = pte_wrprotect(pte); >> 838 pte = pte_mkclean(pte); >> 839 set_pte_at(vma->vm_mm, address, ptep, pte); >> 840 unlock_pte: >> 841 pte_unmap_unlock(ptep, ptl); >> 842 } >> 843 >> 844 mmu_notifier_invalidate_range_end(&range); >> 845 } >> 846 i_mmap_unlock_read(mapping); >> 847 } >> 848 926 static int dax_writeback_one(struct xa_state * 849 static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, 927 struct address_space *mapping, 850 struct address_space *mapping, void *entry) 928 { 851 { 929 unsigned long pfn, index, count, end; !! 
852 unsigned long pfn, index, count; 930 long ret = 0; 853 long ret = 0; 931 struct vm_area_struct *vma; << 932 854 933 /* 855 /* 934 * A page got tagged dirty in DAX mapp 856 * A page got tagged dirty in DAX mapping? Something is seriously 935 * wrong. 857 * wrong. 936 */ 858 */ 937 if (WARN_ON(!xa_is_value(entry))) 859 if (WARN_ON(!xa_is_value(entry))) 938 return -EIO; 860 return -EIO; 939 861 940 if (unlikely(dax_is_locked(entry))) { 862 if (unlikely(dax_is_locked(entry))) { 941 void *old_entry = entry; 863 void *old_entry = entry; 942 864 943 entry = get_unlocked_entry(xas 865 entry = get_unlocked_entry(xas, 0); 944 866 945 /* Entry got punched out / rea 867 /* Entry got punched out / reallocated? */ 946 if (!entry || WARN_ON_ONCE(!xa 868 if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) 947 goto put_unlocked; 869 goto put_unlocked; 948 /* 870 /* 949 * Entry got reallocated elsew 871 * Entry got reallocated elsewhere? No need to writeback. 950 * We have to compare pfns as 872 * We have to compare pfns as we must not bail out due to 951 * difference in lockbit or en 873 * difference in lockbit or entry type. 952 */ 874 */ 953 if (dax_to_pfn(old_entry) != d 875 if (dax_to_pfn(old_entry) != dax_to_pfn(entry)) 954 goto put_unlocked; 876 goto put_unlocked; 955 if (WARN_ON_ONCE(dax_is_empty_ 877 if (WARN_ON_ONCE(dax_is_empty_entry(entry) || 956 dax_is 878 dax_is_zero_entry(entry))) { 957 ret = -EIO; 879 ret = -EIO; 958 goto put_unlocked; 880 goto put_unlocked; 959 } 881 } 960 882 961 /* Another fsync thread may ha 883 /* Another fsync thread may have already done this entry */ 962 if (!xas_get_mark(xas, PAGECAC 884 if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE)) 963 goto put_unlocked; 885 goto put_unlocked; 964 } 886 } 965 887 966 /* Lock the entry to serialize with pa 888 /* Lock the entry to serialize with page faults */ 967 dax_lock_entry(xas, entry); 889 dax_lock_entry(xas, entry); 968 890 969 /* 891 /* 970 * We can clear the tag now but we hav 892 * We can clear the tag now but we have to be careful so that concurrent 971 * dax_writeback_one() calls for the s 893 * dax_writeback_one() calls for the same index cannot finish before we 972 * actually flush the caches. This is 894 * actually flush the caches. This is achieved as the calls will look 973 * at the entry only under the i_pages 895 * at the entry only under the i_pages lock and once they do that 974 * they will see the entry locked and 896 * they will see the entry locked and wait for it to unlock. 975 */ 897 */ 976 xas_clear_mark(xas, PAGECACHE_TAG_TOWR 898 xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE); 977 xas_unlock_irq(xas); 899 xas_unlock_irq(xas); 978 900 979 /* 901 /* 980 * If dax_writeback_mapping_range() wa 902 * If dax_writeback_mapping_range() was given a wbc->range_start 981 * in the middle of a PMD, the 'index' 903 * in the middle of a PMD, the 'index' we use needs to be 982 * aligned to the start of the PMD. 904 * aligned to the start of the PMD. 983 * This allows us to flush for PMD_SIZ 905 * This allows us to flush for PMD_SIZE and not have to worry about 984 * partial PMD writebacks. 906 * partial PMD writebacks. 
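 *
 * As a concrete example (assuming 2MiB PMDs, i.e. 512 4KiB pages): a
 * dirty PMD entry found at xas->xa_index == 0x203 gives count == 512
 * and index == 0x200, so the flush below covers the whole 2MiB range.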
985 */ 907 */ 986 pfn = dax_to_pfn(entry); 908 pfn = dax_to_pfn(entry); 987 count = 1UL << dax_entry_order(entry); 909 count = 1UL << dax_entry_order(entry); 988 index = xas->xa_index & ~(count - 1); 910 index = xas->xa_index & ~(count - 1); 989 end = index + count - 1; << 990 << 991 /* Walk all mappings of a given index << 992 i_mmap_lock_read(mapping); << 993 vma_interval_tree_foreach(vma, &mappin << 994 pfn_mkclean_range(pfn, count, << 995 cond_resched(); << 996 } << 997 i_mmap_unlock_read(mapping); << 998 911 >> 912 dax_entry_mkclean(mapping, index, pfn); 999 dax_flush(dax_dev, page_address(pfn_to 913 dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE); 1000 /* 914 /* 1001 * After we have flushed the cache, w 915 * After we have flushed the cache, we can clear the dirty tag. There 1002 * cannot be new dirty data in the pf 916 * cannot be new dirty data in the pfn after the flush has completed as 1003 * the pfn mappings are writeprotecte 917 * the pfn mappings are writeprotected and fault waits for mapping 1004 * entry lock. 918 * entry lock. 1005 */ 919 */ 1006 xas_reset(xas); 920 xas_reset(xas); 1007 xas_lock_irq(xas); 921 xas_lock_irq(xas); 1008 xas_store(xas, entry); 922 xas_store(xas, entry); 1009 xas_clear_mark(xas, PAGECACHE_TAG_DIR 923 xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); 1010 dax_wake_entry(xas, entry, WAKE_NEXT) !! 924 dax_wake_entry(xas, entry, false); 1011 925 1012 trace_dax_writeback_one(mapping->host 926 trace_dax_writeback_one(mapping->host, index, count); 1013 return ret; 927 return ret; 1014 928 1015 put_unlocked: 929 put_unlocked: 1016 put_unlocked_entry(xas, entry, WAKE_N !! 930 put_unlocked_entry(xas, entry); 1017 return ret; 931 return ret; 1018 } 932 } 1019 933 1020 /* 934 /* 1021 * Flush the mapping to the persistent domain 935 * Flush the mapping to the persistent domain within the byte range of [start, 1022 * end]. This is required by data integrity o 936 * end]. This is required by data integrity operations to ensure file data is 1023 * on persistent storage prior to completion 937 * on persistent storage prior to completion of the operation. 1024 */ 938 */ 1025 int dax_writeback_mapping_range(struct addres 939 int dax_writeback_mapping_range(struct address_space *mapping, 1026 struct dax_device *dax_dev, s 940 struct dax_device *dax_dev, struct writeback_control *wbc) 1027 { 941 { 1028 XA_STATE(xas, &mapping->i_pages, wbc- 942 XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT); 1029 struct inode *inode = mapping->host; 943 struct inode *inode = mapping->host; 1030 pgoff_t end_index = wbc->range_end >> 944 pgoff_t end_index = wbc->range_end >> PAGE_SHIFT; 1031 void *entry; 945 void *entry; 1032 int ret = 0; 946 int ret = 0; 1033 unsigned int scanned = 0; 947 unsigned int scanned = 0; 1034 948 1035 if (WARN_ON_ONCE(inode->i_blkbits != 949 if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) 1036 return -EIO; 950 return -EIO; 1037 951 1038 if (mapping_empty(mapping) || wbc->sy !! 952 if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) 1039 return 0; 953 return 0; 1040 954 1041 trace_dax_writeback_range(inode, xas. 955 trace_dax_writeback_range(inode, xas.xa_index, end_index); 1042 956 1043 tag_pages_for_writeback(mapping, xas. 
957 tag_pages_for_writeback(mapping, xas.xa_index, end_index); 1044 958 1045 xas_lock_irq(&xas); 959 xas_lock_irq(&xas); 1046 xas_for_each_marked(&xas, entry, end_ 960 xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) { 1047 ret = dax_writeback_one(&xas, 961 ret = dax_writeback_one(&xas, dax_dev, mapping, entry); 1048 if (ret < 0) { 962 if (ret < 0) { 1049 mapping_set_error(map 963 mapping_set_error(mapping, ret); 1050 break; 964 break; 1051 } 965 } 1052 if (++scanned % XA_CHECK_SCHE 966 if (++scanned % XA_CHECK_SCHED) 1053 continue; 967 continue; 1054 968 1055 xas_pause(&xas); 969 xas_pause(&xas); 1056 xas_unlock_irq(&xas); 970 xas_unlock_irq(&xas); 1057 cond_resched(); 971 cond_resched(); 1058 xas_lock_irq(&xas); 972 xas_lock_irq(&xas); 1059 } 973 } 1060 xas_unlock_irq(&xas); 974 xas_unlock_irq(&xas); 1061 trace_dax_writeback_range_done(inode, 975 trace_dax_writeback_range_done(inode, xas.xa_index, end_index); 1062 return ret; 976 return ret; 1063 } 977 } 1064 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range 978 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); 1065 979 1066 static int dax_iomap_direct_access(const stru !! 980 static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) 1067 size_t size, void **kaddr, pf !! 981 { >> 982 return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9; >> 983 } >> 984 >> 985 static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size, >> 986 pfn_t *pfnp) 1068 { 987 { 1069 pgoff_t pgoff = dax_iomap_pgoff(iomap !! 988 const sector_t sector = dax_iomap_sector(iomap, pos); 1070 int id, rc = 0; !! 989 pgoff_t pgoff; >> 990 int id, rc; 1071 long length; 991 long length; 1072 992 >> 993 rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff); >> 994 if (rc) >> 995 return rc; 1073 id = dax_read_lock(); 996 id = dax_read_lock(); 1074 length = dax_direct_access(iomap->dax 997 length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size), 1075 DAX_ACCESS !! 998 NULL, pfnp); 1076 if (length < 0) { 999 if (length < 0) { 1077 rc = length; 1000 rc = length; 1078 goto out; 1001 goto out; 1079 } 1002 } 1080 if (!pfnp) << 1081 goto out_check_addr; << 1082 rc = -EINVAL; 1003 rc = -EINVAL; 1083 if (PFN_PHYS(length) < size) 1004 if (PFN_PHYS(length) < size) 1084 goto out; 1005 goto out; 1085 if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(s 1006 if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) 1086 goto out; 1007 goto out; 1087 /* For larger pages we need devmap */ 1008 /* For larger pages we need devmap */ 1088 if (length > 1 && !pfn_t_devmap(*pfnp 1009 if (length > 1 && !pfn_t_devmap(*pfnp)) 1089 goto out; 1010 goto out; 1090 rc = 0; 1011 rc = 0; 1091 << 1092 out_check_addr: << 1093 if (!kaddr) << 1094 goto out; << 1095 if (!*kaddr) << 1096 rc = -EFAULT; << 1097 out: 1012 out: 1098 dax_read_unlock(id); 1013 dax_read_unlock(id); 1099 return rc; 1014 return rc; 1100 } 1015 } 1101 1016 1102 /** << 1103 * dax_iomap_copy_around - Prepare for an una << 1104 * by copying the data before and after the r << 1105 * @pos: address to do copy from. << 1106 * @length: size of copy operation. << 1107 * @align_size: aligned w.r.t align_size (eit << 1108 * @srcmap: iomap srcmap << 1109 * @daddr: destination address to copy t << 1110 * << 1111 * This can be called from two places. Either << 1112 * aligned), to copy the length size data to << 1113 * write operation, dax_iomap_iter() might ca << 1114 * start or end unaligned address. 
In the lat << 1115 * aligned ranges is taken care by dax_iomap_ << 1116 * If the srcmap contains invalid data, such << 1117 * area to make sure no old data remains. << 1118 */ << 1119 static int dax_iomap_copy_around(loff_t pos, << 1120 const struct iomap *srcmap, v << 1121 { << 1122 loff_t head_off = pos & (align_size - << 1123 size_t size = ALIGN(head_off + length << 1124 loff_t end = pos + length; << 1125 loff_t pg_end = round_up(end, align_s << 1126 /* copy_all is usually in page fault << 1127 bool copy_all = head_off == 0 && end << 1128 /* zero the edges if srcmap is a HOLE << 1129 bool zero_edge = srcmap->flags & IOMA << 1130 srcmap->type == IOMA << 1131 void *saddr = NULL; << 1132 int ret = 0; << 1133 << 1134 if (!zero_edge) { << 1135 ret = dax_iomap_direct_access << 1136 if (ret) << 1137 return dax_mem2blk_er << 1138 } << 1139 << 1140 if (copy_all) { << 1141 if (zero_edge) << 1142 memset(daddr, 0, size << 1143 else << 1144 ret = copy_mc_to_kern << 1145 goto out; << 1146 } << 1147 << 1148 /* Copy the head part of the range */ << 1149 if (head_off) { << 1150 if (zero_edge) << 1151 memset(daddr, 0, head << 1152 else { << 1153 ret = copy_mc_to_kern << 1154 if (ret) << 1155 return -EIO; << 1156 } << 1157 } << 1158 << 1159 /* Copy the tail part of the range */ << 1160 if (end < pg_end) { << 1161 loff_t tail_off = head_off + << 1162 loff_t tail_len = pg_end - en << 1163 << 1164 if (zero_edge) << 1165 memset(daddr + tail_o << 1166 else { << 1167 ret = copy_mc_to_kern << 1168 << 1169 if (ret) << 1170 return -EIO; << 1171 } << 1172 } << 1173 out: << 1174 if (zero_edge) << 1175 dax_flush(srcmap->dax_dev, da << 1176 return ret ? -EIO : 0; << 1177 } << 1178 << 1179 /* 1017 /* 1180 * The user has performed a load from a hole 1018 * The user has performed a load from a hole in the file. Allocating a new 1181 * page in the file would cause excessive sto 1019 * page in the file would cause excessive storage usage for workloads with 1182 * sparse files. Instead we insert a read-on 1020 * sparse files. Instead we insert a read-only mapping of the 4k zero page. 1183 * If this page is ever written to we will re 1021 * If this page is ever written to we will re-fault and change the mapping to 1184 * point to real DAX storage instead. 1022 * point to real DAX storage instead. 1185 */ 1023 */ 1186 static vm_fault_t dax_load_hole(struct xa_sta !! 1024 static vm_fault_t dax_load_hole(struct xa_state *xas, 1187 const struct iomap_iter *iter !! 1025 struct address_space *mapping, void **entry, >> 1026 struct vm_fault *vmf) 1188 { 1027 { 1189 struct inode *inode = iter->inode; !! 1028 struct inode *inode = mapping->host; 1190 unsigned long vaddr = vmf->address; 1029 unsigned long vaddr = vmf->address; 1191 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn( 1030 pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr)); 1192 vm_fault_t ret; 1031 vm_fault_t ret; 1193 1032 1194 *entry = dax_insert_entry(xas, vmf, i !! 1033 *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, >> 1034 DAX_ZERO_PAGE, false); 1195 1035 1196 ret = vmf_insert_mixed(vmf->vma, vadd 1036 ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); 1197 trace_dax_load_hole(inode, vmf, ret); 1037 trace_dax_load_hole(inode, vmf, ret); 1198 return ret; 1038 return ret; 1199 } 1039 } 1200 1040 1201 #ifdef CONFIG_FS_DAX_PMD !! 1041 int dax_iomap_zero(loff_t pos, unsigned offset, unsigned size, 1202 static vm_fault_t dax_pmd_load_hole(struct xa !! 
1042 struct iomap *iomap) 1203 const struct iomap_iter *iter << 1204 { << 1205 struct address_space *mapping = vmf-> << 1206 unsigned long pmd_addr = vmf->address << 1207 struct vm_area_struct *vma = vmf->vma << 1208 struct inode *inode = mapping->host; << 1209 pgtable_t pgtable = NULL; << 1210 struct folio *zero_folio; << 1211 spinlock_t *ptl; << 1212 pmd_t pmd_entry; << 1213 pfn_t pfn; << 1214 << 1215 zero_folio = mm_get_huge_zero_folio(v << 1216 << 1217 if (unlikely(!zero_folio)) << 1218 goto fallback; << 1219 << 1220 pfn = page_to_pfn_t(&zero_folio->page << 1221 *entry = dax_insert_entry(xas, vmf, i << 1222 DAX_PMD | D << 1223 << 1224 if (arch_needs_pgtable_deposit()) { << 1225 pgtable = pte_alloc_one(vma-> << 1226 if (!pgtable) << 1227 return VM_FAULT_OOM; << 1228 } << 1229 << 1230 ptl = pmd_lock(vmf->vma->vm_mm, vmf-> << 1231 if (!pmd_none(*(vmf->pmd))) { << 1232 spin_unlock(ptl); << 1233 goto fallback; << 1234 } << 1235 << 1236 if (pgtable) { << 1237 pgtable_trans_huge_deposit(vm << 1238 mm_inc_nr_ptes(vma->vm_mm); << 1239 } << 1240 pmd_entry = mk_pmd(&zero_folio->page, << 1241 pmd_entry = pmd_mkhuge(pmd_entry); << 1242 set_pmd_at(vmf->vma->vm_mm, pmd_addr, << 1243 spin_unlock(ptl); << 1244 trace_dax_pmd_load_hole(inode, vmf, z << 1245 return VM_FAULT_NOPAGE; << 1246 << 1247 fallback: << 1248 if (pgtable) << 1249 pte_free(vma->vm_mm, pgtable) << 1250 trace_dax_pmd_load_hole_fallback(inod << 1251 return VM_FAULT_FALLBACK; << 1252 } << 1253 #else << 1254 static vm_fault_t dax_pmd_load_hole(struct xa << 1255 const struct iomap_iter *iter << 1256 { << 1257 return VM_FAULT_FALLBACK; << 1258 } << 1259 #endif /* CONFIG_FS_DAX_PMD */ << 1260 << 1261 static s64 dax_unshare_iter(struct iomap_iter << 1262 { 1043 { 1263 struct iomap *iomap = &iter->iomap; !! 1044 sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); 1264 const struct iomap *srcmap = iomap_it !! 1045 pgoff_t pgoff; 1265 loff_t copy_pos = iter->pos; !! 1046 long rc, id; 1266 u64 copy_len = iomap_length(iter); !! 1047 void *kaddr; 1267 u32 mod; !! 1048 bool page_aligned = false; 1268 int id = 0; << 1269 s64 ret = 0; << 1270 void *daddr = NULL, *saddr = NULL; << 1271 1049 1272 if (!iomap_want_unshare_iter(iter)) << 1273 return iomap_length(iter); << 1274 1050 1275 /* !! 1051 if (IS_ALIGNED(sector << SECTOR_SHIFT, PAGE_SIZE) && 1276 * Extend the file range to be aligne !! 1052 IS_ALIGNED(size, PAGE_SIZE)) 1277 * we need to copy entire blocks, not !! 1053 page_aligned = true; 1278 * Invalidate the mapping because we' << 1279 */ << 1280 mod = offset_in_page(copy_pos); << 1281 if (mod) { << 1282 copy_len += mod; << 1283 copy_pos -= mod; << 1284 } << 1285 1054 1286 mod = offset_in_page(copy_pos + copy_ !! 1055 rc = bdev_dax_pgoff(iomap->bdev, sector, PAGE_SIZE, &pgoff); 1287 if (mod) !! 1056 if (rc) 1288 copy_len += PAGE_SIZE - mod; !! 1057 return rc; 1289 << 1290 invalidate_inode_pages2_range(iter->i << 1291 copy_po << 1292 (copy_p << 1293 1058 1294 id = dax_read_lock(); 1059 id = dax_read_lock(); 1295 ret = dax_iomap_direct_access(iomap, << 1296 if (ret < 0) << 1297 goto out_unlock; << 1298 << 1299 ret = dax_iomap_direct_access(srcmap, << 1300 if (ret < 0) << 1301 goto out_unlock; << 1302 1060 1303 if (copy_mc_to_kernel(daddr, saddr, c !! 1061 if (page_aligned) 1304 ret = iomap_length(iter); !! 1062 rc = dax_zero_page_range(iomap->dax_dev, pgoff, >> 1063 size >> PAGE_SHIFT); 1305 else 1064 else 1306 ret = -EIO; !! 1065 rc = dax_direct_access(iomap->dax_dev, pgoff, 1, &kaddr, NULL); 1307 !! 
1066 if (rc < 0) { 1308 out_unlock: << 1309 dax_read_unlock(id); << 1310 return dax_mem2blk_err(ret); << 1311 } << 1312 << 1313 int dax_file_unshare(struct inode *inode, lof << 1314 const struct iomap_ops *ops) << 1315 { << 1316 struct iomap_iter iter = { << 1317 .inode = inode, << 1318 .pos = pos, << 1319 .flags = IOMAP_WRITE << 1320 }; << 1321 loff_t size = i_size_read(inode); << 1322 int ret; << 1323 << 1324 if (pos < 0 || pos >= size) << 1325 return 0; << 1326 << 1327 iter.len = min(len, size - pos); << 1328 while ((ret = iomap_iter(&iter, ops)) << 1329 iter.processed = dax_unshare_ << 1330 return ret; << 1331 } << 1332 EXPORT_SYMBOL_GPL(dax_file_unshare); << 1333 << 1334 static int dax_memzero(struct iomap_iter *ite << 1335 { << 1336 const struct iomap *iomap = &iter->io << 1337 const struct iomap *srcmap = iomap_it << 1338 unsigned offset = offset_in_page(pos) << 1339 pgoff_t pgoff = dax_iomap_pgoff(iomap << 1340 void *kaddr; << 1341 long ret; << 1342 << 1343 ret = dax_direct_access(iomap->dax_de << 1344 NULL); << 1345 if (ret < 0) << 1346 return dax_mem2blk_err(ret); << 1347 << 1348 memset(kaddr + offset, 0, size); << 1349 if (iomap->flags & IOMAP_F_SHARED) << 1350 ret = dax_iomap_copy_around(p << 1351 k << 1352 else << 1353 dax_flush(iomap->dax_dev, kad << 1354 return ret; << 1355 } << 1356 << 1357 static s64 dax_zero_iter(struct iomap_iter *i << 1358 { << 1359 const struct iomap *iomap = &iter->io << 1360 const struct iomap *srcmap = iomap_it << 1361 loff_t pos = iter->pos; << 1362 u64 length = iomap_length(iter); << 1363 s64 written = 0; << 1364 << 1365 /* already zeroed? we're done. */ << 1366 if (srcmap->type == IOMAP_HOLE || src << 1367 return length; << 1368 << 1369 /* << 1370 * invalidate the pages whose sharing << 1371 * because of CoW. << 1372 */ << 1373 if (iomap->flags & IOMAP_F_SHARED) << 1374 invalidate_inode_pages2_range << 1375 << 1376 << 1377 << 1378 do { << 1379 unsigned offset = offset_in_p << 1380 unsigned size = min_t(u64, PA << 1381 pgoff_t pgoff = dax_iomap_pgo << 1382 long rc; << 1383 int id; << 1384 << 1385 id = dax_read_lock(); << 1386 if (IS_ALIGNED(pos, PAGE_SIZE << 1387 rc = dax_zero_page_ra << 1388 else << 1389 rc = dax_memzero(iter << 1390 dax_read_unlock(id); 1067 dax_read_unlock(id); >> 1068 return rc; >> 1069 } 1391 1070 1392 if (rc < 0) !! 1071 if (!page_aligned) { 1393 return rc; !! 1072 memset(kaddr + offset, 0, size); 1394 pos += size; !! 1073 dax_flush(iomap->dax_dev, kaddr + offset, size); 1395 length -= size; !! 1074 } 1396 written += size; !! 1075 dax_read_unlock(id); 1397 } while (length > 0); !! 1076 return 0; 1398 << 1399 if (did_zero) << 1400 *did_zero = true; << 1401 return written; << 1402 } << 1403 << 1404 int dax_zero_range(struct inode *inode, loff_ << 1405 const struct iomap_ops *ops) << 1406 { << 1407 struct iomap_iter iter = { << 1408 .inode = inode, << 1409 .pos = pos, << 1410 .len = len, << 1411 .flags = IOMAP_DAX | << 1412 }; << 1413 int ret; << 1414 << 1415 while ((ret = iomap_iter(&iter, ops)) << 1416 iter.processed = dax_zero_ite << 1417 return ret; << 1418 } << 1419 EXPORT_SYMBOL_GPL(dax_zero_range); << 1420 << 1421 int dax_truncate_page(struct inode *inode, lo << 1422 const struct iomap_ops *ops) << 1423 { << 1424 unsigned int blocksize = i_blocksize( << 1425 unsigned int off = pos & (blocksize - << 1426 << 1427 /* Block boundary? 
Nothing to do */ << 1428 if (!off) << 1429 return 0; << 1430 return dax_zero_range(inode, pos, blo << 1431 } 1077 } 1432 EXPORT_SYMBOL_GPL(dax_truncate_page); << 1433 1078 1434 static loff_t dax_iomap_iter(const struct iom !! 1079 static loff_t 1435 struct iov_iter *iter) !! 1080 dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, >> 1081 struct iomap *iomap, struct iomap *srcmap) 1436 { 1082 { 1437 const struct iomap *iomap = &iomi->io !! 1083 struct block_device *bdev = iomap->bdev; 1438 const struct iomap *srcmap = iomap_it << 1439 loff_t length = iomap_length(iomi); << 1440 loff_t pos = iomi->pos; << 1441 struct dax_device *dax_dev = iomap->d 1084 struct dax_device *dax_dev = iomap->dax_dev; >> 1085 struct iov_iter *iter = data; 1442 loff_t end = pos + length, done = 0; 1086 loff_t end = pos + length, done = 0; 1443 bool write = iov_iter_rw(iter) == WRI << 1444 bool cow = write && iomap->flags & IO << 1445 ssize_t ret = 0; 1087 ssize_t ret = 0; 1446 size_t xfer; 1088 size_t xfer; 1447 int id; 1089 int id; 1448 1090 1449 if (!write) { !! 1091 if (iov_iter_rw(iter) == READ) { 1450 end = min(end, i_size_read(io !! 1092 end = min(end, i_size_read(inode)); 1451 if (pos >= end) 1093 if (pos >= end) 1452 return 0; 1094 return 0; 1453 1095 1454 if (iomap->type == IOMAP_HOLE 1096 if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) 1455 return iov_iter_zero( 1097 return iov_iter_zero(min(length, end - pos), iter); 1456 } 1098 } 1457 1099 1458 /* !! 1100 if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) 1459 * In DAX mode, enforce either pure o << 1460 * writes to unwritten extents as par << 1461 */ << 1462 if (WARN_ON_ONCE(iomap->type != IOMAP << 1463 !(iomap->flags & IOMA << 1464 return -EIO; 1101 return -EIO; 1465 1102 1466 /* 1103 /* 1467 * Write can allocate block for an ar 1104 * Write can allocate block for an area which has a hole page mapped 1468 * into page tables. We have to tear 1105 * into page tables. We have to tear down these mappings so that data 1469 * written by write(2) is visible in 1106 * written by write(2) is visible in mmap. 1470 */ 1107 */ 1471 if (iomap->flags & IOMAP_F_NEW || cow !! 1108 if (iomap->flags & IOMAP_F_NEW) { 1472 /* !! 1109 invalidate_inode_pages2_range(inode->i_mapping, 1473 * Filesystem allows CoW on n << 1474 * may have been mmapped with << 1475 * invalidate its dax entries << 1476 * in advance. << 1477 */ << 1478 if (cow) << 1479 __dax_clear_dirty_ran << 1480 << 1481 << 1482 invalidate_inode_pages2_range << 1483 1110 pos >> PAGE_SHIFT, 1484 1111 (end - 1) >> PAGE_SHIFT); 1485 } 1112 } 1486 1113 1487 id = dax_read_lock(); 1114 id = dax_read_lock(); 1488 while (pos < end) { 1115 while (pos < end) { 1489 unsigned offset = pos & (PAGE 1116 unsigned offset = pos & (PAGE_SIZE - 1); 1490 const size_t size = ALIGN(len 1117 const size_t size = ALIGN(length + offset, PAGE_SIZE); 1491 pgoff_t pgoff = dax_iomap_pgo !! 1118 const sector_t sector = dax_iomap_sector(iomap, pos); 1492 ssize_t map_len; 1119 ssize_t map_len; 1493 bool recovery = false; !! 1120 pgoff_t pgoff; 1494 void *kaddr; 1121 void *kaddr; 1495 1122 1496 if (fatal_signal_pending(curr 1123 if (fatal_signal_pending(current)) { 1497 ret = -EINTR; 1124 ret = -EINTR; 1498 break; 1125 break; 1499 } 1126 } 1500 1127 >> 1128 ret = bdev_dax_pgoff(bdev, sector, size, &pgoff); >> 1129 if (ret) >> 1130 break; >> 1131 1501 map_len = dax_direct_access(d 1132 map_len = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), 1502 DAX_ACCESS, & !! 
1133 &kaddr, NULL); 1503 if (map_len == -EHWPOISON && << 1504 map_len = dax_direct_ << 1505 PHYS_ << 1506 &kadd << 1507 if (map_len > 0) << 1508 recovery = tr << 1509 } << 1510 if (map_len < 0) { 1134 if (map_len < 0) { 1511 ret = dax_mem2blk_err !! 1135 ret = map_len; 1512 break; 1136 break; 1513 } 1137 } 1514 1138 1515 if (cow) { << 1516 ret = dax_iomap_copy_ << 1517 << 1518 if (ret) << 1519 break; << 1520 } << 1521 << 1522 map_len = PFN_PHYS(map_len); 1139 map_len = PFN_PHYS(map_len); 1523 kaddr += offset; 1140 kaddr += offset; 1524 map_len -= offset; 1141 map_len -= offset; 1525 if (map_len > end - pos) 1142 if (map_len > end - pos) 1526 map_len = end - pos; 1143 map_len = end - pos; 1527 1144 1528 if (recovery) !! 1145 /* 1529 xfer = dax_recovery_w !! 1146 * The userspace address for the memory copy has already been 1530 map_l !! 1147 * validated via access_ok() in either vfs_read() or 1531 else if (write) !! 1148 * vfs_write(), depending on which operation we are doing. >> 1149 */ >> 1150 if (iov_iter_rw(iter) == WRITE) 1532 xfer = dax_copy_from_ 1151 xfer = dax_copy_from_iter(dax_dev, pgoff, kaddr, 1533 map_l 1152 map_len, iter); 1534 else 1153 else 1535 xfer = dax_copy_to_it 1154 xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, 1536 map_l 1155 map_len, iter); 1537 1156 1538 pos += xfer; 1157 pos += xfer; 1539 length -= xfer; 1158 length -= xfer; 1540 done += xfer; 1159 done += xfer; 1541 1160 1542 if (xfer == 0) 1161 if (xfer == 0) 1543 ret = -EFAULT; 1162 ret = -EFAULT; 1544 if (xfer < map_len) 1163 if (xfer < map_len) 1545 break; 1164 break; 1546 } 1165 } 1547 dax_read_unlock(id); 1166 dax_read_unlock(id); 1548 1167 1549 return done ? done : ret; 1168 return done ? done : ret; 1550 } 1169 } 1551 1170 1552 /** 1171 /** 1553 * dax_iomap_rw - Perform I/O to a DAX file 1172 * dax_iomap_rw - Perform I/O to a DAX file 1554 * @iocb: The control block for this I/ 1173 * @iocb: The control block for this I/O 1555 * @iter: The addresses to do I/O from 1174 * @iter: The addresses to do I/O from or to 1556 * @ops: iomap ops passed from the fil 1175 * @ops: iomap ops passed from the file system 1557 * 1176 * 1558 * This function performs read and write oper 1177 * This function performs read and write operations to directly mapped 1559 * persistent memory. The callers needs to t 1178 * persistent memory. The callers needs to take care of read/write exclusion 1560 * and evicting any page cache pages in the r 1179 * and evicting any page cache pages in the region under I/O. 1561 */ 1180 */ 1562 ssize_t 1181 ssize_t 1563 dax_iomap_rw(struct kiocb *iocb, struct iov_i 1182 dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, 1564 const struct iomap_ops *ops) 1183 const struct iomap_ops *ops) 1565 { 1184 { 1566 struct iomap_iter iomi = { !! 1185 struct address_space *mapping = iocb->ki_filp->f_mapping; 1567 .inode = iocb->ki_fi !! 1186 struct inode *inode = mapping->host; 1568 .pos = iocb->ki_po !! 1187 loff_t pos = iocb->ki_pos, ret = 0, done = 0; 1569 .len = iov_iter_co !! 1188 unsigned flags = 0; 1570 .flags = IOMAP_DAX, << 1571 }; << 1572 loff_t done = 0; << 1573 int ret; << 1574 << 1575 if (!iomi.len) << 1576 return 0; << 1577 1189 1578 if (iov_iter_rw(iter) == WRITE) { 1190 if (iov_iter_rw(iter) == WRITE) { 1579 lockdep_assert_held_write(&io !! 1191 lockdep_assert_held_write(&inode->i_rwsem); 1580 iomi.flags |= IOMAP_WRITE; !! 1192 flags |= IOMAP_WRITE; 1581 } else { 1193 } else { 1582 lockdep_assert_held(&iomi.ino !! 
1194 lockdep_assert_held(&inode->i_rwsem); 1583 } 1195 } 1584 1196 1585 if (iocb->ki_flags & IOCB_NOWAIT) 1197 if (iocb->ki_flags & IOCB_NOWAIT) 1586 iomi.flags |= IOMAP_NOWAIT; !! 1198 flags |= IOMAP_NOWAIT; 1587 1199 1588 while ((ret = iomap_iter(&iomi, ops)) !! 1200 while (iov_iter_count(iter)) { 1589 iomi.processed = dax_iomap_it !! 1201 ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, >> 1202 iter, dax_iomap_actor); >> 1203 if (ret <= 0) >> 1204 break; >> 1205 pos += ret; >> 1206 done += ret; >> 1207 } 1590 1208 1591 done = iomi.pos - iocb->ki_pos; !! 1209 iocb->ki_pos += done; 1592 iocb->ki_pos = iomi.pos; << 1593 return done ? done : ret; 1210 return done ? done : ret; 1594 } 1211 } 1595 EXPORT_SYMBOL_GPL(dax_iomap_rw); 1212 EXPORT_SYMBOL_GPL(dax_iomap_rw); 1596 1213 1597 static vm_fault_t dax_fault_return(int error) 1214 static vm_fault_t dax_fault_return(int error) 1598 { 1215 { 1599 if (error == 0) 1216 if (error == 0) 1600 return VM_FAULT_NOPAGE; 1217 return VM_FAULT_NOPAGE; 1601 return vmf_error(error); 1218 return vmf_error(error); 1602 } 1219 } 1603 1220 1604 /* 1221 /* 1605 * When handling a synchronous page fault and !! 1222 * MAP_SYNC on a dax mapping guarantees dirty metadata is 1606 * insert the PTE/PMD into page tables only a !! 1223 * flushed on write-faults (non-cow), but not read-faults. 1607 * insertion for now and return the pfn so th << 1608 * fsync is done. << 1609 */ 1224 */ 1610 static vm_fault_t dax_fault_synchronous_pfnp( !! 1225 static bool dax_fault_is_synchronous(unsigned long flags, 1611 { !! 1226 struct vm_area_struct *vma, struct iomap *iomap) 1612 if (WARN_ON_ONCE(!pfnp)) << 1613 return VM_FAULT_SIGBUS; << 1614 *pfnp = pfn; << 1615 return VM_FAULT_NEEDDSYNC; << 1616 } << 1617 << 1618 static vm_fault_t dax_fault_cow_page(struct v << 1619 const struct iomap_iter *iter << 1620 { 1227 { 1621 vm_fault_t ret; !! 1228 return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) 1622 int error = 0; !! 1229 && (iomap->flags & IOMAP_F_DIRTY); 1623 << 1624 switch (iter->iomap.type) { << 1625 case IOMAP_HOLE: << 1626 case IOMAP_UNWRITTEN: << 1627 clear_user_highpage(vmf->cow_ << 1628 break; << 1629 case IOMAP_MAPPED: << 1630 error = copy_cow_page_dax(vmf << 1631 break; << 1632 default: << 1633 WARN_ON_ONCE(1); << 1634 error = -EIO; << 1635 break; << 1636 } << 1637 << 1638 if (error) << 1639 return dax_fault_return(error << 1640 << 1641 __SetPageUptodate(vmf->cow_page); << 1642 ret = finish_fault(vmf); << 1643 if (!ret) << 1644 return VM_FAULT_DONE_COW; << 1645 return ret; << 1646 } << 1647 << 1648 /** << 1649 * dax_fault_iter - Common actor to handle pf << 1650 * @vmf: vm fault instance << 1651 * @iter: iomap iter << 1652 * @pfnp: pfn to be returned << 1653 * @xas: the dax mapping tree of a fil << 1654 * @entry: an unlocked dax entry to be i << 1655 * @pmd: distinguish whether it is a p << 1656 */ << 1657 static vm_fault_t dax_fault_iter(struct vm_fa << 1658 const struct iomap_iter *iter << 1659 struct xa_state *xas, void ** << 1660 { << 1661 const struct iomap *iomap = &iter->io << 1662 const struct iomap *srcmap = iomap_it << 1663 size_t size = pmd ? PMD_SIZE : PAGE_S << 1664 loff_t pos = (loff_t)xas->xa_index << << 1665 bool write = iter->flags & IOMAP_WRIT << 1666 unsigned long entry_flags = pmd ? 
DAX << 1667 int err = 0; << 1668 pfn_t pfn; << 1669 void *kaddr; << 1670 << 1671 if (!pmd && vmf->cow_page) << 1672 return dax_fault_cow_page(vmf << 1673 << 1674 /* if we are reading UNWRITTEN and HO << 1675 if (!write && << 1676 (iomap->type == IOMAP_UNWRITTEN | << 1677 if (!pmd) << 1678 return dax_load_hole( << 1679 return dax_pmd_load_hole(xas, << 1680 } << 1681 << 1682 if (iomap->type != IOMAP_MAPPED && !( << 1683 WARN_ON_ONCE(1); << 1684 return pmd ? VM_FAULT_FALLBAC << 1685 } << 1686 << 1687 err = dax_iomap_direct_access(iomap, << 1688 if (err) << 1689 return pmd ? VM_FAULT_FALLBAC << 1690 << 1691 *entry = dax_insert_entry(xas, vmf, i << 1692 << 1693 if (write && iomap->flags & IOMAP_F_S << 1694 err = dax_iomap_copy_around(p << 1695 if (err) << 1696 return dax_fault_retu << 1697 } << 1698 << 1699 if (dax_fault_is_synchronous(iter, vm << 1700 return dax_fault_synchronous_ << 1701 << 1702 /* insert PMD pfn */ << 1703 if (pmd) << 1704 return vmf_insert_pfn_pmd(vmf << 1705 << 1706 /* insert PTE pfn */ << 1707 if (write) << 1708 return vmf_insert_mixed_mkwri << 1709 return vmf_insert_mixed(vmf->vma, vmf << 1710 } 1230 } 1711 1231 1712 static vm_fault_t dax_iomap_pte_fault(struct 1232 static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, 1713 int *iomap_err 1233 int *iomap_errp, const struct iomap_ops *ops) 1714 { 1234 { 1715 struct address_space *mapping = vmf-> !! 1235 struct vm_area_struct *vma = vmf->vma; >> 1236 struct address_space *mapping = vma->vm_file->f_mapping; 1716 XA_STATE(xas, &mapping->i_pages, vmf- 1237 XA_STATE(xas, &mapping->i_pages, vmf->pgoff); 1717 struct iomap_iter iter = { !! 1238 struct inode *inode = mapping->host; 1718 .inode = mapping->ho !! 1239 unsigned long vaddr = vmf->address; 1719 .pos = (loff_t)vmf !! 1240 loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; 1720 .len = PAGE_SIZE, !! 1241 struct iomap iomap = { .type = IOMAP_HOLE }; 1721 .flags = IOMAP_DAX | !! 1242 struct iomap srcmap = { .type = IOMAP_HOLE }; 1722 }; !! 1243 unsigned flags = IOMAP_FAULT; >> 1244 int error, major = 0; >> 1245 bool write = vmf->flags & FAULT_FLAG_WRITE; >> 1246 bool sync; 1723 vm_fault_t ret = 0; 1247 vm_fault_t ret = 0; 1724 void *entry; 1248 void *entry; 1725 int error; !! 1249 pfn_t pfn; 1726 1250 1727 trace_dax_pte_fault(iter.inode, vmf, !! 1251 trace_dax_pte_fault(inode, vmf, ret); 1728 /* 1252 /* 1729 * Check whether offset isn't beyond 1253 * Check whether offset isn't beyond end of file now. Caller is supposed 1730 * to hold locks serializing us with 1254 * to hold locks serializing us with truncate / punch hole so this is 1731 * a reliable test. 1255 * a reliable test. 1732 */ 1256 */ 1733 if (iter.pos >= i_size_read(iter.inod !! 1257 if (pos >= i_size_read(inode)) { 1734 ret = VM_FAULT_SIGBUS; 1258 ret = VM_FAULT_SIGBUS; 1735 goto out; 1259 goto out; 1736 } 1260 } 1737 1261 1738 if ((vmf->flags & FAULT_FLAG_WRITE) & !! 1262 if (write && !vmf->cow_page) 1739 iter.flags |= IOMAP_WRITE; !! 
1263 flags |= IOMAP_WRITE; 1740 1264 1741 entry = grab_mapping_entry(&xas, mapp 1265 entry = grab_mapping_entry(&xas, mapping, 0); 1742 if (xa_is_internal(entry)) { 1266 if (xa_is_internal(entry)) { 1743 ret = xa_to_internal(entry); 1267 ret = xa_to_internal(entry); 1744 goto out; 1268 goto out; 1745 } 1269 } 1746 1270 1747 /* 1271 /* 1748 * It is possible, particularly with 1272 * It is possible, particularly with mixed reads & writes to private 1749 * mappings, that we have raced with 1273 * mappings, that we have raced with a PMD fault that overlaps with 1750 * the PTE we need to set up. If so 1274 * the PTE we need to set up. If so just return and the fault will be 1751 * retried. 1275 * retried. 1752 */ 1276 */ 1753 if (pmd_trans_huge(*vmf->pmd) || pmd_ 1277 if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { 1754 ret = VM_FAULT_NOPAGE; 1278 ret = VM_FAULT_NOPAGE; 1755 goto unlock_entry; 1279 goto unlock_entry; 1756 } 1280 } 1757 1281 1758 while ((error = iomap_iter(&iter, ops !! 1282 /* 1759 if (WARN_ON_ONCE(iomap_length !! 1283 * Note that we don't bother to use iomap_apply here: DAX required 1760 iter.processed = -EIO !! 1284 * the file system block size to be equal the page size, which means 1761 continue; !! 1285 * that we never have to deal with more than a single extent here. >> 1286 */ >> 1287 error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap, &srcmap); >> 1288 if (iomap_errp) >> 1289 *iomap_errp = error; >> 1290 if (error) { >> 1291 ret = dax_fault_return(error); >> 1292 goto unlock_entry; >> 1293 } >> 1294 if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { >> 1295 error = -EIO; /* fs corruption? */ >> 1296 goto error_finish_iomap; >> 1297 } >> 1298 >> 1299 if (vmf->cow_page) { >> 1300 sector_t sector = dax_iomap_sector(&iomap, pos); >> 1301 >> 1302 switch (iomap.type) { >> 1303 case IOMAP_HOLE: >> 1304 case IOMAP_UNWRITTEN: >> 1305 clear_user_highpage(vmf->cow_page, vaddr); >> 1306 break; >> 1307 case IOMAP_MAPPED: >> 1308 error = copy_user_dax(iomap.bdev, iomap.dax_dev, >> 1309 sector, PAGE_SIZE, vmf->cow_page, vaddr); >> 1310 break; >> 1311 default: >> 1312 WARN_ON_ONCE(1); >> 1313 error = -EIO; >> 1314 break; 1762 } 1315 } 1763 1316 1764 ret = dax_fault_iter(vmf, &it !! 1317 if (error) 1765 if (ret != VM_FAULT_SIGBUS && !! 1318 goto error_finish_iomap; 1766 (iter.iomap.flags & IOMAP !! 1319 >> 1320 __SetPageUptodate(vmf->cow_page); >> 1321 ret = finish_fault(vmf); >> 1322 if (!ret) >> 1323 ret = VM_FAULT_DONE_COW; >> 1324 goto finish_iomap; >> 1325 } >> 1326 >> 1327 sync = dax_fault_is_synchronous(flags, vma, &iomap); >> 1328 >> 1329 switch (iomap.type) { >> 1330 case IOMAP_MAPPED: >> 1331 if (iomap.flags & IOMAP_F_NEW) { 1767 count_vm_event(PGMAJF 1332 count_vm_event(PGMAJFAULT); 1768 count_memcg_event_mm( !! 1333 count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); 1769 ret |= VM_FAULT_MAJOR !! 1334 major = VM_FAULT_MAJOR; >> 1335 } >> 1336 error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); >> 1337 if (error < 0) >> 1338 goto error_finish_iomap; >> 1339 >> 1340 entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, >> 1341 0, write && !sync); >> 1342 >> 1343 /* >> 1344 * If we are doing synchronous page fault and inode needs fsync, >> 1345 * we can insert PTE into page tables only after that happens. >> 1346 * Skip insertion for now and return the pfn so that caller can >> 1347 * insert it after fsync is done. 
>> 1348 */ >> 1349 if (sync) { >> 1350 if (WARN_ON_ONCE(!pfnp)) { >> 1351 error = -EIO; >> 1352 goto error_finish_iomap; >> 1353 } >> 1354 *pfnp = pfn; >> 1355 ret = VM_FAULT_NEEDDSYNC | major; >> 1356 goto finish_iomap; 1770 } 1357 } >> 1358 trace_dax_insert_mapping(inode, vmf, entry); >> 1359 if (write) >> 1360 ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); >> 1361 else >> 1362 ret = vmf_insert_mixed(vma, vaddr, pfn); 1771 1363 1772 if (!(ret & VM_FAULT_ERROR)) !! 1364 goto finish_iomap; 1773 iter.processed = PAGE !! 1365 case IOMAP_UNWRITTEN: >> 1366 case IOMAP_HOLE: >> 1367 if (!write) { >> 1368 ret = dax_load_hole(&xas, mapping, &entry, vmf); >> 1369 goto finish_iomap; >> 1370 } >> 1371 /*FALLTHRU*/ >> 1372 default: >> 1373 WARN_ON_ONCE(1); >> 1374 error = -EIO; >> 1375 break; 1774 } 1376 } 1775 1377 1776 if (iomap_errp) !! 1378 error_finish_iomap: 1777 *iomap_errp = error; !! 1379 ret = dax_fault_return(error); 1778 if (!ret && error) !! 1380 finish_iomap: 1779 ret = dax_fault_return(error) !! 1381 if (ops->iomap_end) { >> 1382 int copied = PAGE_SIZE; 1780 1383 1781 unlock_entry: !! 1384 if (ret & VM_FAULT_ERROR) >> 1385 copied = 0; >> 1386 /* >> 1387 * The fault is done by now and there's no way back (other >> 1388 * thread may be already happily using PTE we have installed). >> 1389 * Just ignore error from ->iomap_end since we cannot do much >> 1390 * with it. >> 1391 */ >> 1392 ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); >> 1393 } >> 1394 unlock_entry: 1782 dax_unlock_entry(&xas, entry); 1395 dax_unlock_entry(&xas, entry); 1783 out: !! 1396 out: 1784 trace_dax_pte_fault_done(iter.inode, !! 1397 trace_dax_pte_fault_done(inode, vmf, ret); 1785 return ret; !! 1398 return ret | major; 1786 } 1399 } 1787 1400 1788 #ifdef CONFIG_FS_DAX_PMD 1401 #ifdef CONFIG_FS_DAX_PMD 1789 static bool dax_fault_check_fallback(struct v !! 1402 static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, 1790 pgoff_t max_pgoff) !! 1403 struct iomap *iomap, void **entry) 1791 { 1404 { >> 1405 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1792 unsigned long pmd_addr = vmf->address 1406 unsigned long pmd_addr = vmf->address & PMD_MASK; 1793 bool write = vmf->flags & FAULT_FLAG_ !! 1407 struct vm_area_struct *vma = vmf->vma; >> 1408 struct inode *inode = mapping->host; >> 1409 pgtable_t pgtable = NULL; >> 1410 struct page *zero_page; >> 1411 spinlock_t *ptl; >> 1412 pmd_t pmd_entry; >> 1413 pfn_t pfn; 1794 1414 1795 /* !! 1415 zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); 1796 * Make sure that the faulting addres << 1797 * the PMD offset from the start of t << 1798 * that a PMD range in the page table << 1799 * range in the page cache. << 1800 */ << 1801 if ((vmf->pgoff & PG_PMD_COLOUR) != << 1802 ((vmf->address >> PAGE_SHIFT) & P << 1803 return true; << 1804 1416 1805 /* Fall back to PTEs if we're going t !! 1417 if (unlikely(!zero_page)) 1806 if (write && !(vmf->vma->vm_flags & V !! 1418 goto fallback; 1807 return true; << 1808 1419 1809 /* If the PMD would extend outside th !! 1420 pfn = page_to_pfn_t(zero_page); 1810 if (pmd_addr < vmf->vma->vm_start) !! 1421 *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, 1811 return true; !! 1422 DAX_PMD | DAX_ZERO_PAGE, false); 1812 if ((pmd_addr + PMD_SIZE) > vmf->vma- << 1813 return true; << 1814 1423 1815 /* If the PMD would extend beyond the !! 1424 if (arch_needs_pgtable_deposit()) { 1816 if ((xas->xa_index | PG_PMD_COLOUR) > !! 1425 pgtable = pte_alloc_one(vma->vm_mm); 1817 return true; !! 
1426 if (!pgtable) >> 1427 return VM_FAULT_OOM; >> 1428 } >> 1429 >> 1430 ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); >> 1431 if (!pmd_none(*(vmf->pmd))) { >> 1432 spin_unlock(ptl); >> 1433 goto fallback; >> 1434 } 1818 1435 1819 return false; !! 1436 if (pgtable) { >> 1437 pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); >> 1438 mm_inc_nr_ptes(vma->vm_mm); >> 1439 } >> 1440 pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); >> 1441 pmd_entry = pmd_mkhuge(pmd_entry); >> 1442 set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); >> 1443 spin_unlock(ptl); >> 1444 trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); >> 1445 return VM_FAULT_NOPAGE; >> 1446 >> 1447 fallback: >> 1448 if (pgtable) >> 1449 pte_free(vma->vm_mm, pgtable); >> 1450 trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); >> 1451 return VM_FAULT_FALLBACK; 1820 } 1452 } 1821 1453 1822 static vm_fault_t dax_iomap_pmd_fault(struct 1454 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, 1823 const struct i 1455 const struct iomap_ops *ops) 1824 { 1456 { 1825 struct address_space *mapping = vmf-> !! 1457 struct vm_area_struct *vma = vmf->vma; >> 1458 struct address_space *mapping = vma->vm_file->f_mapping; 1826 XA_STATE_ORDER(xas, &mapping->i_pages 1459 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, PMD_ORDER); 1827 struct iomap_iter iter = { !! 1460 unsigned long pmd_addr = vmf->address & PMD_MASK; 1828 .inode = mapping->ho !! 1461 bool write = vmf->flags & FAULT_FLAG_WRITE; 1829 .len = PMD_SIZE, !! 1462 bool sync; 1830 .flags = IOMAP_DAX | !! 1463 unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; 1831 }; !! 1464 struct inode *inode = mapping->host; 1832 vm_fault_t ret = VM_FAULT_FALLBACK; !! 1465 vm_fault_t result = VM_FAULT_FALLBACK; >> 1466 struct iomap iomap = { .type = IOMAP_HOLE }; >> 1467 struct iomap srcmap = { .type = IOMAP_HOLE }; 1833 pgoff_t max_pgoff; 1468 pgoff_t max_pgoff; 1834 void *entry; 1469 void *entry; 1835 !! 1470 loff_t pos; 1836 if (vmf->flags & FAULT_FLAG_WRITE) !! 1471 int error; 1837 iter.flags |= IOMAP_WRITE; !! 1472 pfn_t pfn; 1838 1473 1839 /* 1474 /* 1840 * Check whether offset isn't beyond 1475 * Check whether offset isn't beyond end of file now. Caller is 1841 * supposed to hold locks serializing 1476 * supposed to hold locks serializing us with truncate / punch hole so 1842 * this is a reliable test. 1477 * this is a reliable test. 1843 */ 1478 */ 1844 max_pgoff = DIV_ROUND_UP(i_size_read( !! 1479 max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); 1845 1480 1846 trace_dax_pmd_fault(iter.inode, vmf, !! 1481 trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); >> 1482 >> 1483 /* >> 1484 * Make sure that the faulting address's PMD offset (color) matches >> 1485 * the PMD offset from the start of the file. This is necessary so >> 1486 * that a PMD range in the page table overlaps exactly with a PMD >> 1487 * range in the page cache. >> 1488 */ >> 1489 if ((vmf->pgoff & PG_PMD_COLOUR) != >> 1490 ((vmf->address >> PAGE_SHIFT) & PG_PMD_COLOUR)) >> 1491 goto fallback; >> 1492 >> 1493 /* Fall back to PTEs if we're going to COW */ >> 1494 if (write && !(vma->vm_flags & VM_SHARED)) >> 1495 goto fallback; >> 1496 >> 1497 /* If the PMD would extend outside the VMA */ >> 1498 if (pmd_addr < vma->vm_start) >> 1499 goto fallback; >> 1500 if ((pmd_addr + PMD_SIZE) > vma->vm_end) >> 1501 goto fallback; 1847 1502 1848 if (xas.xa_index >= max_pgoff) { 1503 if (xas.xa_index >= max_pgoff) { 1849 ret = VM_FAULT_SIGBUS; !! 
1504 result = VM_FAULT_SIGBUS; 1850 goto out; 1505 goto out; 1851 } 1506 } 1852 1507 1853 if (dax_fault_check_fallback(vmf, &xa !! 1508 /* If the PMD would extend beyond the file size */ >> 1509 if ((xas.xa_index | PG_PMD_COLOUR) >= max_pgoff) 1854 goto fallback; 1510 goto fallback; 1855 1511 1856 /* 1512 /* 1857 * grab_mapping_entry() will make sur 1513 * grab_mapping_entry() will make sure we get an empty PMD entry, 1858 * a zero PMD entry or a DAX PMD. If 1514 * a zero PMD entry or a DAX PMD. If it can't (because a PTE 1859 * entry is already in the array, for 1515 * entry is already in the array, for instance), it will return 1860 * VM_FAULT_FALLBACK. 1516 * VM_FAULT_FALLBACK. 1861 */ 1517 */ 1862 entry = grab_mapping_entry(&xas, mapp 1518 entry = grab_mapping_entry(&xas, mapping, PMD_ORDER); 1863 if (xa_is_internal(entry)) { 1519 if (xa_is_internal(entry)) { 1864 ret = xa_to_internal(entry); !! 1520 result = xa_to_internal(entry); 1865 goto fallback; 1521 goto fallback; 1866 } 1522 } 1867 1523 1868 /* 1524 /* 1869 * It is possible, particularly with 1525 * It is possible, particularly with mixed reads & writes to private 1870 * mappings, that we have raced with 1526 * mappings, that we have raced with a PTE fault that overlaps with 1871 * the PMD we need to set up. If so 1527 * the PMD we need to set up. If so just return and the fault will be 1872 * retried. 1528 * retried. 1873 */ 1529 */ 1874 if (!pmd_none(*vmf->pmd) && !pmd_tran 1530 if (!pmd_none(*vmf->pmd) && !pmd_trans_huge(*vmf->pmd) && 1875 !pmd_devmap(*vmf->pmd 1531 !pmd_devmap(*vmf->pmd)) { 1876 ret = 0; !! 1532 result = 0; 1877 goto unlock_entry; 1533 goto unlock_entry; 1878 } 1534 } 1879 1535 1880 iter.pos = (loff_t)xas.xa_index << PA !! 1536 /* 1881 while (iomap_iter(&iter, ops) > 0) { !! 1537 * Note that we don't use iomap_apply here. We aren't doing I/O, only 1882 if (iomap_length(&iter) < PMD !! 1538 * setting up a mapping, so really we're using iomap_begin() as a way 1883 continue; /* actually !! 1539 * to look up our filesystem block. >> 1540 */ >> 1541 pos = (loff_t)xas.xa_index << PAGE_SHIFT; >> 1542 error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap, >> 1543 &srcmap); >> 1544 if (error) >> 1545 goto unlock_entry; >> 1546 >> 1547 if (iomap.offset + iomap.length < pos + PMD_SIZE) >> 1548 goto finish_iomap; >> 1549 >> 1550 sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap); 1884 1551 1885 ret = dax_fault_iter(vmf, &it !! 1552 switch (iomap.type) { 1886 if (ret != VM_FAULT_FALLBACK) !! 1553 case IOMAP_MAPPED: 1887 iter.processed = PMD_ !! 1554 error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn); >> 1555 if (error < 0) >> 1556 goto finish_iomap; >> 1557 >> 1558 entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, >> 1559 DAX_PMD, write && !sync); >> 1560 >> 1561 /* >> 1562 * If we are doing synchronous page fault and inode needs fsync, >> 1563 * we can insert PMD into page tables only after that happens. >> 1564 * Skip insertion for now and return the pfn so that caller can >> 1565 * insert it after fsync is done. 
>> 1566 */ >> 1567 if (sync) { >> 1568 if (WARN_ON_ONCE(!pfnp)) >> 1569 goto finish_iomap; >> 1570 *pfnp = pfn; >> 1571 result = VM_FAULT_NEEDDSYNC; >> 1572 goto finish_iomap; >> 1573 } >> 1574 >> 1575 trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry); >> 1576 result = vmf_insert_pfn_pmd(vmf, pfn, write); >> 1577 break; >> 1578 case IOMAP_UNWRITTEN: >> 1579 case IOMAP_HOLE: >> 1580 if (WARN_ON_ONCE(write)) >> 1581 break; >> 1582 result = dax_pmd_load_hole(&xas, vmf, &iomap, &entry); >> 1583 break; >> 1584 default: >> 1585 WARN_ON_ONCE(1); >> 1586 break; 1888 } 1587 } 1889 1588 1890 unlock_entry: !! 1589 finish_iomap: >> 1590 if (ops->iomap_end) { >> 1591 int copied = PMD_SIZE; >> 1592 >> 1593 if (result == VM_FAULT_FALLBACK) >> 1594 copied = 0; >> 1595 /* >> 1596 * The fault is done by now and there's no way back (other >> 1597 * thread may be already happily using PMD we have installed). >> 1598 * Just ignore error from ->iomap_end since we cannot do much >> 1599 * with it. >> 1600 */ >> 1601 ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, >> 1602 &iomap); >> 1603 } >> 1604 unlock_entry: 1891 dax_unlock_entry(&xas, entry); 1605 dax_unlock_entry(&xas, entry); 1892 fallback: !! 1606 fallback: 1893 if (ret == VM_FAULT_FALLBACK) { !! 1607 if (result == VM_FAULT_FALLBACK) { 1894 split_huge_pmd(vmf->vma, vmf- !! 1608 split_huge_pmd(vma, vmf->pmd, vmf->address); 1895 count_vm_event(THP_FAULT_FALL 1609 count_vm_event(THP_FAULT_FALLBACK); 1896 } 1610 } 1897 out: 1611 out: 1898 trace_dax_pmd_fault_done(iter.inode, !! 1612 trace_dax_pmd_fault_done(inode, vmf, max_pgoff, result); 1899 return ret; !! 1613 return result; 1900 } 1614 } 1901 #else 1615 #else 1902 static vm_fault_t dax_iomap_pmd_fault(struct 1616 static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, 1903 const struct i 1617 const struct iomap_ops *ops) 1904 { 1618 { 1905 return VM_FAULT_FALLBACK; 1619 return VM_FAULT_FALLBACK; 1906 } 1620 } 1907 #endif /* CONFIG_FS_DAX_PMD */ 1621 #endif /* CONFIG_FS_DAX_PMD */ 1908 1622 1909 /** 1623 /** 1910 * dax_iomap_fault - handle a page fault on a 1624 * dax_iomap_fault - handle a page fault on a DAX file 1911 * @vmf: The description of the fault 1625 * @vmf: The description of the fault 1912 * @order: Order of the page to fault in !! 1626 * @pe_size: Size of the page to fault in 1913 * @pfnp: PFN to insert for synchronous fault 1627 * @pfnp: PFN to insert for synchronous faults if fsync is required 1914 * @iomap_errp: Storage for detailed error co 1628 * @iomap_errp: Storage for detailed error code in case of error 1915 * @ops: Iomap ops passed from the file syste 1629 * @ops: Iomap ops passed from the file system 1916 * 1630 * 1917 * When a page fault occurs, filesystems may 1631 * When a page fault occurs, filesystems may call this helper in 1918 * their fault handler for DAX files. dax_iom 1632 * their fault handler for DAX files. dax_iomap_fault() assumes the caller 1919 * has done all the necessary locking for pag 1633 * has done all the necessary locking for page fault to proceed 1920 * successfully. 1634 * successfully. 1921 */ 1635 */ 1922 vm_fault_t dax_iomap_fault(struct vm_fault *v !! 1636 vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, 1923 pfn_t *pfnp, int *iomap_e 1637 pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) 1924 { 1638 { 1925 if (order == 0) !! 
1639 switch (pe_size) { >> 1640 case PE_SIZE_PTE: 1926 return dax_iomap_pte_fault(vm 1641 return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops); 1927 else if (order == PMD_ORDER) !! 1642 case PE_SIZE_PMD: 1928 return dax_iomap_pmd_fault(vm 1643 return dax_iomap_pmd_fault(vmf, pfnp, ops); 1929 else !! 1644 default: 1930 return VM_FAULT_FALLBACK; 1645 return VM_FAULT_FALLBACK; >> 1646 } 1931 } 1647 } 1932 EXPORT_SYMBOL_GPL(dax_iomap_fault); 1648 EXPORT_SYMBOL_GPL(dax_iomap_fault); 1933 1649 1934 /* 1650 /* 1935 * dax_insert_pfn_mkwrite - insert PTE or PMD 1651 * dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables 1936 * @vmf: The description of the fault 1652 * @vmf: The description of the fault 1937 * @pfn: PFN to insert 1653 * @pfn: PFN to insert 1938 * @order: Order of entry to insert. 1654 * @order: Order of entry to insert. 1939 * 1655 * 1940 * This function inserts a writeable PTE or P 1656 * This function inserts a writeable PTE or PMD entry into the page tables 1941 * for an mmaped DAX file. It also marks the 1657 * for an mmaped DAX file. It also marks the page cache entry as dirty. 1942 */ 1658 */ 1943 static vm_fault_t 1659 static vm_fault_t 1944 dax_insert_pfn_mkwrite(struct vm_fault *vmf, 1660 dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) 1945 { 1661 { 1946 struct address_space *mapping = vmf-> 1662 struct address_space *mapping = vmf->vma->vm_file->f_mapping; 1947 XA_STATE_ORDER(xas, &mapping->i_pages 1663 XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); 1948 void *entry; 1664 void *entry; 1949 vm_fault_t ret; 1665 vm_fault_t ret; 1950 1666 1951 xas_lock_irq(&xas); 1667 xas_lock_irq(&xas); 1952 entry = get_unlocked_entry(&xas, orde 1668 entry = get_unlocked_entry(&xas, order); 1953 /* Did we race with someone splitting 1669 /* Did we race with someone splitting entry or so? */ 1954 if (!entry || dax_is_conflict(entry) 1670 if (!entry || dax_is_conflict(entry) || 1955 (order == 0 && !dax_is_pte_entry( 1671 (order == 0 && !dax_is_pte_entry(entry))) { 1956 put_unlocked_entry(&xas, entr !! 1672 put_unlocked_entry(&xas, entry); 1957 xas_unlock_irq(&xas); 1673 xas_unlock_irq(&xas); 1958 trace_dax_insert_pfn_mkwrite_ 1674 trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, 1959 1675 VM_FAULT_NOPAGE); 1960 return VM_FAULT_NOPAGE; 1676 return VM_FAULT_NOPAGE; 1961 } 1677 } 1962 xas_set_mark(&xas, PAGECACHE_TAG_DIRT 1678 xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); 1963 dax_lock_entry(&xas, entry); 1679 dax_lock_entry(&xas, entry); 1964 xas_unlock_irq(&xas); 1680 xas_unlock_irq(&xas); 1965 if (order == 0) 1681 if (order == 0) 1966 ret = vmf_insert_mixed_mkwrit 1682 ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); 1967 #ifdef CONFIG_FS_DAX_PMD 1683 #ifdef CONFIG_FS_DAX_PMD 1968 else if (order == PMD_ORDER) 1684 else if (order == PMD_ORDER) 1969 ret = vmf_insert_pfn_pmd(vmf, 1685 ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); 1970 #endif 1686 #endif 1971 else 1687 else 1972 ret = VM_FAULT_FALLBACK; 1688 ret = VM_FAULT_FALLBACK; 1973 dax_unlock_entry(&xas, entry); 1689 dax_unlock_entry(&xas, entry); 1974 trace_dax_insert_pfn_mkwrite(mapping- 1690 trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); 1975 return ret; 1691 return ret; 1976 } 1692 } 1977 1693 1978 /** 1694 /** 1979 * dax_finish_sync_fault - finish synchronous 1695 * dax_finish_sync_fault - finish synchronous page fault 1980 * @vmf: The description of the fault 1696 * @vmf: The description of the fault 1981 * @order: Order of entry to be inserted !! 
1697 * @pe_size: Size of entry to be inserted 1982 * @pfn: PFN to insert 1698 * @pfn: PFN to insert 1983 * 1699 * 1984 * This function ensures that the file range 1700 * This function ensures that the file range touched by the page fault is 1985 * stored persistently on the media and handl 1701 * stored persistently on the media and handles inserting of appropriate page 1986 * table entry. 1702 * table entry. 1987 */ 1703 */ 1988 vm_fault_t dax_finish_sync_fault(struct vm_fa !! 1704 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, 1989 pfn_t pfn) !! 1705 enum page_entry_size pe_size, pfn_t pfn) 1990 { 1706 { 1991 int err; 1707 int err; 1992 loff_t start = ((loff_t)vmf->pgoff) < 1708 loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; >> 1709 unsigned int order = pe_order(pe_size); 1993 size_t len = PAGE_SIZE << order; 1710 size_t len = PAGE_SIZE << order; 1994 1711 1995 err = vfs_fsync_range(vmf->vma->vm_fi 1712 err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1); 1996 if (err) 1713 if (err) 1997 return VM_FAULT_SIGBUS; 1714 return VM_FAULT_SIGBUS; 1998 return dax_insert_pfn_mkwrite(vmf, pf 1715 return dax_insert_pfn_mkwrite(vmf, pfn, order); 1999 } 1716 } 2000 EXPORT_SYMBOL_GPL(dax_finish_sync_fault); 1717 EXPORT_SYMBOL_GPL(dax_finish_sync_fault); 2001 << 2002 static loff_t dax_range_compare_iter(struct i << 2003 struct iomap_iter *it_dest, u << 2004 { << 2005 const struct iomap *smap = &it_src->i << 2006 const struct iomap *dmap = &it_dest-> << 2007 loff_t pos1 = it_src->pos, pos2 = it_ << 2008 void *saddr, *daddr; << 2009 int id, ret; << 2010 << 2011 len = min(len, min(smap->length, dmap << 2012 << 2013 if (smap->type == IOMAP_HOLE && dmap- << 2014 *same = true; << 2015 return len; << 2016 } << 2017 << 2018 if (smap->type == IOMAP_HOLE || dmap- << 2019 *same = false; << 2020 return 0; << 2021 } << 2022 << 2023 id = dax_read_lock(); << 2024 ret = dax_iomap_direct_access(smap, p << 2025 &saddr, << 2026 if (ret < 0) << 2027 goto out_unlock; << 2028 << 2029 ret = dax_iomap_direct_access(dmap, p << 2030 &daddr, << 2031 if (ret < 0) << 2032 goto out_unlock; << 2033 << 2034 *same = !memcmp(saddr, daddr, len); << 2035 if (!*same) << 2036 len = 0; << 2037 dax_read_unlock(id); << 2038 return len; << 2039 << 2040 out_unlock: << 2041 dax_read_unlock(id); << 2042 return -EIO; << 2043 } << 2044 << 2045 int dax_dedupe_file_range_compare(struct inod << 2046 struct inode *dst, loff_t dst << 2047 const struct iomap_ops *ops) << 2048 { << 2049 struct iomap_iter src_iter = { << 2050 .inode = src, << 2051 .pos = srcoff, << 2052 .len = len, << 2053 .flags = IOMAP_DAX, << 2054 }; << 2055 struct iomap_iter dst_iter = { << 2056 .inode = dst, << 2057 .pos = dstoff, << 2058 .len = len, << 2059 .flags = IOMAP_DAX, << 2060 }; << 2061 int ret, compared = 0; << 2062 << 2063 while ((ret = iomap_iter(&src_iter, o << 2064 (ret = iomap_iter(&dst_iter, o << 2065 compared = dax_range_compare_ << 2066 min(src_iter. << 2067 if (compared < 0) << 2068 return ret; << 2069 src_iter.processed = dst_iter << 2070 } << 2071 return ret; << 2072 } << 2073 << 2074 int dax_remap_file_range_prep(struct file *fi << 2075 struct file *fi << 2076 loff_t *len, un << 2077 const struct io << 2078 { << 2079 return __generic_remap_file_range_pre << 2080 << 2081 } << 2082 EXPORT_SYMBOL_GPL(dax_remap_file_range_prep); << 2083 1718