
TOMOYO Linux Cross Reference
Linux/fs/dax.c


Diff markup

Differences between /fs/dax.c (Version linux-6.12-rc7) and /fs/dax.c (Version linux-4.19.322)



  1 // SPDX-License-Identifier: GPL-2.0-only
  2 /*                                                
  3  * fs/dax.c - Direct Access filesystem code       
  4  * Copyright (c) 2013-2014 Intel Corporation      
  5  * Author: Matthew Wilcox <matthew.r.wilcox@intel.com>
  6  * Author: Ross Zwisler <ross.zwisler@linux.intel.com>
  7  */                                               
  8                                                   
  9 #include <linux/atomic.h>                         
 10 #include <linux/blkdev.h>                         
 11 #include <linux/buffer_head.h>                    
 12 #include <linux/dax.h>                            
 13 #include <linux/fs.h>                             
 14 #include <linux/highmem.h>                        
 15 #include <linux/memcontrol.h>                     
 16 #include <linux/mm.h>                             
 17 #include <linux/mutex.h>                          
 18 #include <linux/pagevec.h>                        
 19 #include <linux/sched.h>                          
 20 #include <linux/sched/signal.h>                   
 21 #include <linux/uio.h>                            
 22 #include <linux/vmstat.h>                         
 23 #include <linux/pfn_t.h>                          
 24 #include <linux/sizes.h>                          
 25 #include <linux/mmu_notifier.h>                   
 26 #include <linux/iomap.h>                          
 27 #include <linux/rmap.h>                           
 28 #include <asm/pgalloc.h>                          
 29                                                   
 30 #define CREATE_TRACE_POINTS                       
 31 #include <trace/events/fs_dax.h>                  
 32                                                   
 33 /* We choose 4096 entries - same as per-zone page wait tables */
 34 #define DAX_WAIT_TABLE_BITS 12
 35 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
 36 
 37 /* The 'colour' (ie low bits) within a PMD of a page offset.  */
 38 #define PG_PMD_COLOUR   ((PMD_SIZE >> PAGE_SHIFT) - 1)
 39 #define PG_PMD_NR       (PMD_SIZE >> PAGE_SHIFT)
 40 
 41 static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES];
 42                                                   
 43 static int __init init_dax_wait_table(void)       
 44 {                                                 
 45         int i;                                    
 46                                                   
 47         for (i = 0; i < DAX_WAIT_TABLE_ENTRIES; i++)
 48                 init_waitqueue_head(wait_table + i);
 49         return 0;                                 
 50 }                                                 
 51 fs_initcall(init_dax_wait_table);                 
 52                                                   
 53 /*                                                
 54  * DAX pagecache entries use XArray value entries so they can't be mistaken
 55  * for pages.  We use one bit for locking, one bit for the entry size (PMD)
 56  * and two more to tell us if the entry is a zero page or an empty entry that
 57  * is just used for locking.  In total four special bits.
 58  *
 59  * If the PMD bit isn't set the entry has size PAGE_SIZE, and if the ZERO_PAGE
 60  * and EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
 61  * block allocation.                              
 62  */                                               
 63 #define DAX_SHIFT       (4)                       
 64 #define DAX_LOCKED      (1UL << 0)                
 65 #define DAX_PMD         (1UL << 1)                
 66 #define DAX_ZERO_PAGE   (1UL << 2)                
 67 #define DAX_EMPTY       (1UL << 3)                
 68                                                   
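/*
 * Illustrative sketch, not part of fs/dax.c: the four flag bits above live in
 * the low DAX_SHIFT bits of an XArray value and the pfn lives above them,
 * which is what dax_make_entry()/dax_to_pfn() below implement.  The example_*
 * helpers are made up for illustration only.
 */
static inline unsigned long example_pack_dax_value(unsigned long pfn,
                                                   unsigned long flags)
{
        /* e.g. pfn 0x1234 with DAX_PMD set packs to 0x12342 */
        return (pfn << DAX_SHIFT) | flags;
}

static inline unsigned long example_unpack_dax_pfn(unsigned long value)
{
        return value >> DAX_SHIFT;      /* recovers 0x1234 */
}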
 69 static unsigned long dax_to_pfn(void *entry)
 70 {
 71         return xa_to_value(entry) >> DAX_SHIFT;
 72 }
 73 
 74 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 75 {
 76         return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
 77 }
 78 
 79 static bool dax_is_locked(void *entry)
 80 {
 81         return xa_to_value(entry) & DAX_LOCKED;
 82 }
 83 
 84 static unsigned int dax_entry_order(void *entry)
 85 {                                                 
 86         if (xa_to_value(entry) & DAX_PMD)         
 87                 return PMD_ORDER;                 
 88         return 0;                                 
 89 }                                                 
 90                                                   
 91 static unsigned long dax_is_pmd_entry(void *entry)
 92 {
 93         return xa_to_value(entry) & DAX_PMD;
 94 }
 95 
 96 static bool dax_is_pte_entry(void *entry)
 97 {
 98         return !(xa_to_value(entry) & DAX_PMD);
 99 }
100 
101 static int dax_is_zero_entry(void *entry)
102 {
103         return xa_to_value(entry) & DAX_ZERO_PAGE;
104 }                                                 
105                                                   
106 static int dax_is_empty_entry(void *entry)        
107 {                                                 
108         return xa_to_value(entry) & DAX_EMPTY;    
109 }                                                 
110                                                   
111 /*                                                
112  * true if the entry that was found is of a smaller order than the entry
113  * we were looking for
114  */                                               
115 static bool dax_is_conflict(void *entry)          
116 {                                                 
117         return entry == XA_RETRY_ENTRY;           
118 }                                                 
119                                                   
120 /*                                                
121  * DAX page cache entry locking                   
122  */                                               
123 struct exceptional_entry_key {                    
124         struct xarray *xa;                        
125         pgoff_t entry_start;                      
126 };                                                
127                                                   
128 struct wait_exceptional_entry_queue {             
129         wait_queue_entry_t wait;                  
130         struct exceptional_entry_key key;         
131 };                                                
132                                                   
133 /**                                               
134  * enum dax_wake_mode: waitqueue wakeup behaviour
135  * @WAKE_ALL: wake all waiters in the waitqueue
136  * @WAKE_NEXT: wake only the first waiter in the waitqueue
137  */                                               
138 enum dax_wake_mode {                              
139         WAKE_ALL,                                 
140         WAKE_NEXT,                                
141 };                                                
142                                                   
143 static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas,
144                 void *entry, struct exceptional_entry_key *key)
145 {
146         unsigned long hash;
147         unsigned long index = xas->xa_index;
148 
149         /*
150          * If 'entry' is a PMD, align the 'index' that we use for the wait
151          * queue to the start of that PMD.  This ensures that all offsets in
152          * the range covered by the PMD map to the same bit lock.
153          */
154         if (dax_is_pmd_entry(entry))
155                 index &= ~PG_PMD_COLOUR;
156         key->xa = xas->xa;
157         key->entry_start = index;
158 
159         hash = hash_long((unsigned long)xas->xa ^ index, DAX_WAIT_TABLE_BITS);
160         return wait_table + hash;
161 }
162                                                   
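/*
 * Worked example, assuming 4 KiB pages and 2 MiB PMDs (PG_PMD_COLOUR == 511):
 * for a PMD entry faulted at page index 0x2a7, dax_entry_waitqueue() masks the
 * index with ~PG_PMD_COLOUR, giving entry_start 0x200.  Indices 0x200 through
 * 0x3ff all produce the same key and therefore hash to the same wait_table
 * bucket, so every waiter on that PMD's range shares one waitqueue.
 */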
163 static int wake_exceptional_entry_func(wait_queue_entry_t *wait,
164                 unsigned int mode, int sync, void *keyp)
165 {
166         struct exceptional_entry_key *key = keyp;
167         struct wait_exceptional_entry_queue *ewait =
168                 container_of(wait, struct wait_exceptional_entry_queue, wait);
169 
170         if (key->xa != ewait->key.xa ||
171             key->entry_start != ewait->key.entry_start)
172                 return 0;
173         return autoremove_wake_function(wait, mode, sync, NULL);
174 }
175                                                   
176 /*                                                
177  * @entry may no longer be the entry at the index in the mapping.
178  * The important information it's conveying is whether the entry at
179  * this index used to be a PMD entry.
180  */                                               
181 static void dax_wake_entry(struct xa_state *xas, void *entry,
182                            enum dax_wake_mode mode)
183 {
184         struct exceptional_entry_key key;
185         wait_queue_head_t *wq;
186 
187         wq = dax_entry_waitqueue(xas, entry, &key);
188 
189         /*
190          * Checking for locked entry and prepare_to_wait_exclusive() happens
191          * under the i_pages lock, ditto for entry handling in our callers.
192          * So at this point all tasks that could have seen our entry locked
193          * must be in the waitqueue and the following wake will wake them up.
194          */
195         if (waitqueue_active(wq))
196                 __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key);
197 }
198                                                   
199 /*
200  * Look up entry in page cache, wait for it to become unlocked if it
201  * is a DAX entry and return it.  The caller must subsequently call
202  * put_unlocked_entry() if it did not lock the entry or dax_unlock_entry()
203  * if it did.  The entry returned may have a larger order than @order.
204  * If @order is larger than the order of the entry found in i_pages, this
205  * function returns a dax_is_conflict entry.
206  *
207  * Must be called with the i_pages lock held.
208  */
209 static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
210 {                                                 
211         void *entry;
212         struct wait_exceptional_entry_queue ewait;
213         wait_queue_head_t *wq;
214 
215         init_wait(&ewait.wait);
216         ewait.wait.func = wake_exceptional_entry_func;
217 
218         for (;;) {
219                 entry = xas_find_conflict(xas);
220                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
221                         return entry;
222                 if (dax_entry_order(entry) < order)
223                         return XA_RETRY_ENTRY;
224                 if (!dax_is_locked(entry))
225                         return entry;
226 
227                 wq = dax_entry_waitqueue(xas, entry, &ewait.key);
228                 prepare_to_wait_exclusive(wq, &ewait.wait,
229                                           TASK_UNINTERRUPTIBLE);
230                 xas_unlock_irq(xas);
231                 xas_reset(xas);
232                 schedule();
233                 finish_wait(wq, &ewait.wait);
234                 xas_lock_irq(xas);
235         }
236 }
237                                                   
238 /*                                                
239  * The only thing keeping the address space around is the i_pages lock
240  * (it's cycled in clear_inode() after removing the entries from i_pages)
241  * After we call xas_unlock_irq(), we cannot touch xas->xa.
242  */
243 static void wait_entry_unlocked(struct xa_state *xas, void *entry)
244 {
245         struct wait_exceptional_entry_queue ewait;
246         wait_queue_head_t *wq;
247 
248         init_wait(&ewait.wait);
249         ewait.wait.func = wake_exceptional_entry_func;
250 
251         wq = dax_entry_waitqueue(xas, entry, &ewait.key);
252         /*
253          * Unlike get_unlocked_entry() there is no guarantee that this
254          * path ever successfully retrieves an unlocked entry before an
255          * inode dies. Perform a non-exclusive wait in case this path
256          * never successfully performs its own wake up.
257          */
258         prepare_to_wait(wq, &ewait.wait, TASK_UNINTERRUPTIBLE);
259         xas_unlock_irq(xas);                      
260         schedule();                               
261         finish_wait(wq, &ewait.wait);             
262 }                                                 
263                                                   
264 static void put_unlocked_entry(struct xa_state *xas, void *entry,
265                                enum dax_wake_mode mode)
266 {
267         if (entry && !dax_is_conflict(entry))
268                 dax_wake_entry(xas, entry, mode);
269 }
270 
271 /*
272  * We used the xa_state to get the entry, but then we locked the entry and
273  * dropped the xa_lock, so we know the xa_state is stale and must be reset
274  * before use.
275  */
276 static void dax_unlock_entry(struct xa_state *xas, void *entry)
277 {                                                 
278         void *old;                                
279                                                   
280         BUG_ON(dax_is_locked(entry));             
281         xas_reset(xas);                           
282         xas_lock_irq(xas);                        
283         old = xas_store(xas, entry);              
284         xas_unlock_irq(xas);                      
285         BUG_ON(!dax_is_locked(old));              
286         dax_wake_entry(xas, entry, WAKE_NEXT);    
287 }                                                 
288                                                   
289 /*
290  * Return: The entry stored at this location before it was locked.
291  */
292 static void *dax_lock_entry(struct xa_state *xas, void *entry)
293 {
294         unsigned long v = xa_to_value(entry);
295         return xas_store(xas, xa_mk_value(v | DAX_LOCKED));
296 }
297 
298 static unsigned long dax_entry_size(void *entry)
299 {                                                 
300         if (dax_is_zero_entry(entry))             
301                 return 0;                         
302         else if (dax_is_empty_entry(entry))       
303                 return 0;                         
304         else if (dax_is_pmd_entry(entry))         
305                 return PMD_SIZE;                  
306         else                                      
307                 return PAGE_SIZE;                 
308 }                                                 
309                                                   
310 static unsigned long dax_end_pfn(void *entry)
311 {
312         return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
313 }
314 
315 /*
316  * Iterate through all mapped pfns represented by an entry, i.e. skip
317  * 'empty' and 'zero' entries.
318  */
319 #define for_each_mapped_pfn(entry, pfn) \
320         for (pfn = dax_to_pfn(entry); \
321                         pfn < dax_end_pfn(entry); pfn++)
322 
323 static inline bool dax_page_is_shared(struct page *page)
324 {
325         return page->mapping == PAGE_MAPPING_DAX_SHARED;
326 }
327 
328 /*
329  * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the
330  * refcount.
331  */
332 static inline void dax_page_share_get(struct page *page)
333 {
334         if (page->mapping != PAGE_MAPPING_DAX_SHARED) {
335                 /*
336                  * Reset the index if the page was already mapped
337                  * regularly before.
338                  */
339                 if (page->mapping)
340                         page->share = 1;
341                 page->mapping = PAGE_MAPPING_DAX_SHARED;
342         }
343         page->share++;
344 }
345 
346 static inline unsigned long dax_page_share_put(struct page *page)
347 {
348         return --page->share;
349 }                                                 
350                                                   
351 /*                                                
352  * When it is called in dax_insert_entry(), the shared flag will indicate that
353  * whether this entry is shared by multiple files.  If so, set the page->mapping
354  * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount.
355  */                                               
356 static void dax_associate_entry(void *entry, struct address_space *mapping,
357                 struct vm_area_struct *vma, unsigned long address, bool shared)
358 {
359         unsigned long size = dax_entry_size(entry), pfn, index;
360         int i = 0;
361 
362         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
363                 return;
364 
365         index = linear_page_index(vma, address & ~(size - 1));
366         for_each_mapped_pfn(entry, pfn) {
367                 struct page *page = pfn_to_page(pfn);
368 
369                 if (shared) {
370                         dax_page_share_get(page);
371                 } else {
372                         WARN_ON_ONCE(page->mapping);
373                         page->mapping = mapping;
374                         page->index = index + i++;
375                 }                                 
376         }                                         
377 }                                                 
378                                                   
379 static void dax_disassociate_entry(void *entry, struct address_space *mapping,
380                 bool trunc)
381 {
382         unsigned long pfn;
383 
384         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
385                 return;
386 
387         for_each_mapped_pfn(entry, pfn) {
388                 struct page *page = pfn_to_page(pfn);
389 
390                 WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
391                 if (dax_page_is_shared(page)) {
392                         /* keep the shared flag if this page is still shared */
393                         if (dax_page_share_put(page) > 0)
394                                 continue;
395                 } else
396                         WARN_ON_ONCE(page->mapping && page->mapping != mapping);
397                 page->mapping = NULL;
398                 page->index = 0;
399         }
400 }
401 
402 static struct page *dax_busy_page(void *entry)
403 {
404         unsigned long pfn;
405 
406         for_each_mapped_pfn(entry, pfn) {
407                 struct page *page = pfn_to_page(pfn);
408                                                   
409                 if (page_ref_count(page) > 1)     
410                         return page;              
411         }                                         
412         return NULL;                              
413 }                                                 
414                                                   
415 /**
416  * dax_lock_folio - Lock the DAX entry corresponding to a folio
417  * @folio: The folio whose entry we want to lock
418  *
419  * Context: Process context.
420  * Return: A cookie to pass to dax_unlock_folio() or 0 if the entry could
421  * not be locked.
422  */
423 dax_entry_t dax_lock_folio(struct folio *folio)
424 {                                                 
425         XA_STATE(xas, NULL, 0);                   
426         void *entry;                              
427                                                   
428         /* Ensure folio->mapping isn't freed while we look at it */
429         rcu_read_lock();
430         for (;;) {
431                 struct address_space *mapping = READ_ONCE(folio->mapping);
432 
433                 entry = NULL;
434                 if (!mapping || !dax_mapping(mapping))
435                         break;
436 
437                 /*
438                  * In the device-dax case there's no need to lock, a
439                  * struct dev_pagemap pin is sufficient to keep the
440                  * inode alive, and we assume we have dev_pagemap pin
441                  * otherwise we would not have a valid pfn_to_page()
442                  * translation.
443                  */
444                 entry = (void *)~0UL;
445                 if (S_ISCHR(mapping->host->i_mode))
446                         break;
447 
448                 xas.xa = &mapping->i_pages;
449                 xas_lock_irq(&xas);
450                 if (mapping != folio->mapping) {
451                         xas_unlock_irq(&xas);
452                         continue;
453                 }
454                 xas_set(&xas, folio->index);
455                 entry = xas_load(&xas);
456                 if (dax_is_locked(entry)) {
457                         rcu_read_unlock();
458                         wait_entry_unlocked(&xas, entry);
459                         rcu_read_lock();          
460                         continue;                 
461                 }                                 
462                 dax_lock_entry(&xas, entry);      
463                 xas_unlock_irq(&xas);             
464                 break;                            
465         }                                         
466         rcu_read_unlock();                        
467         return (dax_entry_t)entry;                
468 }                                                 
469                                                   
470 void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
471 {
472         struct address_space *mapping = folio->mapping;
473         XA_STATE(xas, &mapping->i_pages, folio->index);
474 
475         if (S_ISCHR(mapping->host->i_mode))
476                 return;
477 
478         dax_unlock_entry(&xas, (void *)cookie);
479 }                                                 
480                                                   
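/*
 * Usage sketch, not taken from this file: callers such as the memory-failure
 * path hold the entry lock around an inspection of the folio and release it
 * with the returned cookie.  The example_* function is hypothetical.
 */
static void example_inspect_dax_folio(struct folio *folio)
{
        dax_entry_t cookie = dax_lock_folio(folio);

        if (!cookie)
                return;         /* the entry could not be locked */
        /* folio->mapping and folio->index are stable while the lock is held */
        dax_unlock_folio(folio, cookie);
}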
481 /*
482  * dax_lock_mapping_entry - Lock the DAX entry corresponding to a mapping
483  * @mapping: the file's mapping whose entry we want to lock
484  * @index: the offset within this file
485  * @page: output the dax page corresponding to this dax entry
486  *
487  * Return: A cookie to pass to dax_unlock_mapping_entry() or 0 if the entry
488  * could not be locked.
489  */
490 dax_entry_t dax_lock_mapping_entry(struct address_space *mapping, pgoff_t index,
491                 struct page **page)
492 {                                                 
493         XA_STATE(xas, NULL, 0);                   
494         void *entry;                              
495                                                   
496         rcu_read_lock();                          
497         for (;;) {                                
498                 entry = NULL;                     
499                 if (!dax_mapping(mapping))        
500                         break;                    
501                                                   
502                 xas.xa = &mapping->i_pages;       
503                 xas_lock_irq(&xas);               
504                 xas_set(&xas, index);             
505                 entry = xas_load(&xas);           
506                 if (dax_is_locked(entry)) {       
507                         rcu_read_unlock();        
508                         wait_entry_unlocked(&xas, entry);
509                         rcu_read_lock();
510                         continue;
511                 }
512                 if (!entry ||
513                     dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
514                         /*
515                          * Because we are looking for entry from file's mapping
516                          * and index, so the entry may not be inserted for now,
517                          * or even a zero/empty entry.  We don't think this is
518                          * an error case.  So, return a special value and do
519                          * not output @page.
520                          */
521                         entry = (void *)~0UL;
522                 } else {
523                         *page = pfn_to_page(dax_to_pfn(entry));
524                         dax_lock_entry(&xas, entry);
525                 }                                 
526                 xas_unlock_irq(&xas);             
527                 break;                            
528         }                                         
529         rcu_read_unlock();                        
530         return (dax_entry_t)entry;                
531 }                                                 
532                                                   
533 void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index,
534                 dax_entry_t cookie)
535 {
536         XA_STATE(xas, &mapping->i_pages, index);
537 
538         if (cookie == ~0UL)
539                 return;
540 
541         dax_unlock_entry(&xas, (void *)cookie);
542 }                                                 
543                                                   
544 /*                                                
545  * Find page cache entry at given index. If it is a DAX entry, return it
546  * with the entry locked. If the page cache doesn't contain an entry at
547  * that index, add a locked empty entry.
548  *
549  * When requesting an entry with size DAX_PMD, grab_mapping_entry() will
550  * either return that locked entry or will return VM_FAULT_FALLBACK.
551  * This will happen if there are any PTE entries within the PMD range
552  * that we are requesting.
553  *
554  * We always favor PTE entries over PMD entries. There isn't a flow where we
555  * evict PTE entries in order to 'upgrade' them to a PMD entry.  A PMD
556  * insertion will fail if it finds any PTE entries already in the tree, and a
557  * PTE insertion will cause an existing PMD entry to be unmapped and
558  * downgraded to PTE entries.  This happens for both PMD zero pages as
559  * well as PMD empty entries.
560  *
561  * The exception to this downgrade path is for PMD entries that have
562  * real storage backing them.  We will leave these real PMD entries in
563  * the tree, and PTE writes will simply dirty the entire PMD entry.
564  *
565  * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For
566  * persistent memory the benefit is doubtful. We can add that later if we can
567  * show it helps.
568  *
569  * On error, this function does not return an ERR_PTR.  Instead it returns
570  * a VM_FAULT code, encoded as an xarray internal entry.  The ERR_PTR values
571  * overlap with xarray value entries.
572  */                                               
573 static void *grab_mapping_entry(struct xa_state *xas,
574                 struct address_space *mapping, unsigned int order)
575 {
576         unsigned long index = xas->xa_index;
577         bool pmd_downgrade;     /* splitting PMD entry into PTE entries? */
578         void *entry;
579 
580 retry:
581         pmd_downgrade = false;
582         xas_lock_irq(xas);
583         entry = get_unlocked_entry(xas, order);
584 
585         if (entry) {
586                 if (dax_is_conflict(entry))
587                         goto fallback;
588                 if (!xa_is_value(entry)) {
589                         xas_set_err(xas, -EIO);
590                         goto out_unlock;
591                 }
592 
593                 if (order == 0) {
594                         if (dax_is_pmd_entry(entry) &&
595                             (dax_is_zero_entry(entry) ||
596                              dax_is_empty_entry(entry))) {
597                                 pmd_downgrade = true;
598                         }
599                 }
600         }                                         
601                                                   
602         if (pmd_downgrade) {                      
603                 /*
604                  * Make sure 'entry' remains valid while we drop
605                  * the i_pages lock.
606                  */
607                 dax_lock_entry(xas, entry);
608 
609                 /*
610                  * Besides huge zero pages the only other thing that gets
611                  * downgraded are empty entries which don't need to be
612                  * unmapped.
613                  */
614                 if (dax_is_zero_entry(entry)) {
615                         xas_unlock_irq(xas);
616                         unmap_mapping_pages(mapping,
617                                         xas->xa_index & ~PG_PMD_COLOUR,
618                                         PG_PMD_NR, false);
619                         xas_reset(xas);
620                         xas_lock_irq(xas);
621                 }
622 
623                 dax_disassociate_entry(entry, mapping, false);
624                 xas_store(xas, NULL);   /* undo the PMD join */
625                 dax_wake_entry(xas, entry, WAKE_ALL);
626                 mapping->nrpages -= PG_PMD_NR;    
627                 entry = NULL;                     
628                 xas_set(xas, index);              
629         }                                         
630                                                   
631         if (entry) {                              
632                 dax_lock_entry(xas, entry);       
633         } else {                                  
634                 unsigned long flags = DAX_EMPTY;
635 
636                 if (order > 0)
637                         flags |= DAX_PMD;
638                 entry = dax_make_entry(pfn_to_pfn_t(0), flags);
639                 dax_lock_entry(xas, entry);
640                 if (xas_error(xas))
641                         goto out_unlock;
642                 mapping->nrpages += 1UL << order;
643         }
644 
645 out_unlock:
646         xas_unlock_irq(xas);
647         if (xas_nomem(xas, mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM))
648                 goto retry;
649         if (xas->xa_node == XA_ERROR(-ENOMEM))
650                 return xa_mk_internal(VM_FAULT_OOM);
651         if (xas_error(xas))
652                 return xa_mk_internal(VM_FAULT_SIGBUS);
653         return entry;
654 fallback:
655         xas_unlock_irq(xas);
656         return xa_mk_internal(VM_FAULT_FALLBACK);
657 }                                                 
658                                                   
659 /**                                               
660  * dax_layout_busy_page_range - find first pinned page in @mapping
661  * @mapping: address space to scan for a page with ref count > 1
662  * @start: Starting offset. Page containing 'start' is included.
663  * @end: End offset. Page containing 'end' is included. If 'end' is LLONG_MAX,
664  *       pages from 'start' till the end of file are included.
665  *
666  * DAX requires ZONE_DEVICE mapped pages. These pages are never
667  * 'onlined' to the page allocator so they are considered idle when
668  * page->count == 1. A filesystem uses this interface to determine if
669  * any page in the mapping is busy, i.e. for DMA, or other
670  * get_user_pages() usages.
671  *
672  * It is expected that the filesystem is holding locks to block the
673  * establishment of new mappings in this address_space. I.e. it expects
674  * to be able to run unmap_mapping_range() and subsequently not race
675  * mapping_mapped() becoming true.
676  */
677 struct page *dax_layout_busy_page_range(struct address_space *mapping,
678                                         loff_t start, loff_t end)
679 {                                                 
680         void *entry;                              
681         unsigned int scanned = 0;                 
682         struct page *page = NULL;                 
683         pgoff_t start_idx = start >> PAGE_SHIFT;
684         pgoff_t end_idx;
685         XA_STATE(xas, &mapping->i_pages, start_idx);
686 
687         /*
688          * In the 'limited' case get_user_pages() for dax is disabled.
689          */
690         if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
691                 return NULL;
692 
693         if (!dax_mapping(mapping) || !mapping_mapped(mapping))
694                 return NULL;
695 
696         /* If end == LLONG_MAX, all pages from start to till end of file */
697         if (end == LLONG_MAX)
698                 end_idx = ULONG_MAX;
699         else
700                 end_idx = end >> PAGE_SHIFT;
701         /*
702          * If we race get_user_pages_fast() here either we'll see the
703          * elevated page count in the iteration and wait, or
704          * get_user_pages_fast() will see that the page it took a reference
705          * against is no longer mapped in the page tables and issue a
706          * get_user_pages() slow path.  The slow path is protected by
707          * pte_lock() and pmd_lock(). New references are not taken without
708          * holding those locks, and unmap_mapping_pages() will not zero the
709          * pte or pmd without holding the respective lock, so we are
710          * guaranteed to either see new references or prevent new
711          * references from being established.
712          */
713         unmap_mapping_pages(mapping, start_idx, end_idx - start_idx + 1, 0);
714 
715         xas_lock_irq(&xas);
716         xas_for_each(&xas, entry, end_idx) {
717                 if (WARN_ON_ONCE(!xa_is_value(entry)))
718                         continue;
719                 if (unlikely(dax_is_locked(entry)))
720                         entry = get_unlocked_entry(&xas, 0);
721                 if (entry)
722                         page = dax_busy_page(entry);
723                 put_unlocked_entry(&xas, entry, WAKE_NEXT);
724                 if (page)
725                         break;
726                 if (++scanned % XA_CHECK_SCHED)
727                         continue;                 
728                                                   
729                 xas_pause(&xas);                  
730                 xas_unlock_irq(&xas);             
731                 cond_resched();                   
732                 xas_lock_irq(&xas);               
733         }                                         
734         xas_unlock_irq(&xas);                     
735         return page;                              
736 }                                                 
737 EXPORT_SYMBOL_GPL(dax_layout_busy_page_range);    
738                                                   
739 struct page *dax_layout_busy_page(struct address_space *mapping)
740 {
741         return dax_layout_busy_page_range(mapping, 0, LLONG_MAX);
742 }                                                 
743 EXPORT_SYMBOL_GPL(dax_layout_busy_page);          
744                                                   
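/*
 * Usage sketch, not taken from this file: a filesystem breaking DAX layouts
 * before a truncate checks for a busy page and retries once the extra page
 * reference is dropped.  The wait step is elided; real callers (e.g. XFS)
 * wait for the page reference count to return to one before retrying.
 */
static int example_break_dax_layouts(struct address_space *mapping)
{
        struct page *page = dax_layout_busy_page(mapping);

        if (!page)
                return 0;       /* nothing pinned by DMA/get_user_pages() */
        /* ... wait for page_ref_count(page) to drop back to 1, then retry ... */
        return -EBUSY;
}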
745 static int __dax_invalidate_entry(struct address_space *mapping,
746                                           pgoff_t index, bool trunc)
747 {
748         XA_STATE(xas, &mapping->i_pages, index);
749         int ret = 0;
750         void *entry;
751 
752         xas_lock_irq(&xas);
753         entry = get_unlocked_entry(&xas, 0);
754         if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
755                 goto out;
756         if (!trunc &&
757             (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
758              xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
759                 goto out;
760         dax_disassociate_entry(entry, mapping, trunc);
761         xas_store(&xas, NULL);
762         mapping->nrpages -= 1UL << dax_entry_order(entry);
763         ret = 1;
764 out:
765         put_unlocked_entry(&xas, entry, WAKE_ALL);
766         xas_unlock_irq(&xas);                     
767         return ret;                               
768 }                                                 
769                                                   
770 static int __dax_clear_dirty_range(struct address_space *mapping,
771                 pgoff_t start, pgoff_t end)
772 {
773         XA_STATE(xas, &mapping->i_pages, start);
774         unsigned int scanned = 0;                 
775         void *entry;                              
776                                                   
777         xas_lock_irq(&xas);                       
778         xas_for_each(&xas, entry, end) {          
779                 entry = get_unlocked_entry(&xas, 0);
780                 xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
781                 xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
782                 put_unlocked_entry(&xas, entry, WAKE_NEXT);
783 
784                 if (++scanned % XA_CHECK_SCHED)
785                         continue;                 
786                                                   
787                 xas_pause(&xas);                  
788                 xas_unlock_irq(&xas);             
789                 cond_resched();                   
790                 xas_lock_irq(&xas);               
791         }                                         
792         xas_unlock_irq(&xas);                     
793                                                   
794         return 0;                                 
795 }                                                 
796                                                   
797 /*
798  * Delete DAX entry at @index from @mapping.  Wait for it
799  * to be unlocked before deleting it.
800  */
801 int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index)
802 {
803         int ret = __dax_invalidate_entry(mapping, index, true);
804                                                   
805         /*                                        
806          * This gets called from truncate / punch_hole path. As such, the caller
807          * must hold locks protecting against concurrent modifications of the
808          * page cache (usually fs-private i_mmap_sem for writing). Since the
809          * caller has seen a DAX entry for this index, we better find it
810          * at that index as well...
811          */                                       
812         WARN_ON_ONCE(!ret);                       
813         return ret;                               
814 }                                                 
815                                                   
816 /*                                                
817  * Invalidate DAX entry if it is clean.           
818  */                                               
819 int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
820                                       pgoff_t index)
821 {
822         return __dax_invalidate_entry(mapping, index, false);
823 }                                                 
824                                                   
825 static pgoff_t dax_iomap_pgoff(const struct iomap *iomap, loff_t pos)
826 {
827         return PHYS_PFN(iomap->addr + (pos & PAGE_MASK) - iomap->offset);
828 }
829 
830 static int copy_cow_page_dax(struct vm_fault *vmf, const struct iomap_iter *iter)
831 {
832         pgoff_t pgoff = dax_iomap_pgoff(&iter->iomap, iter->pos);
833         void *vto, *kaddr;
834         long rc;
835         int id;
836 
837         id = dax_read_lock();
838         rc = dax_direct_access(iter->iomap.dax_dev, pgoff, 1, DAX_ACCESS,
839                                 &kaddr, NULL);
840         if (rc < 0) {                             
841                 dax_read_unlock(id);              
842                 return rc;                        
843         }                                         
844         vto = kmap_atomic(vmf->cow_page);         
845         copy_user_page(vto, kaddr, vmf->address, vmf->cow_page);
846         kunmap_atomic(vto);                       
847         dax_read_unlock(id);                      
848         return 0;                                 
849 }                                                 
850                                                   
851 /*
852  * MAP_SYNC on a dax mapping guarantees dirty metadata is
853  * flushed on write-faults (non-cow), but not read-faults.
854  */
855 static bool dax_fault_is_synchronous(const struct iomap_iter *iter,
856                 struct vm_area_struct *vma)
857 {
858         return (iter->flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC) &&
859                 (iter->iomap.flags & IOMAP_F_DIRTY);
860 }                                                 
861                                                   
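/*
 * For reference, not part of this file: the VM_SYNC / IOMAP_F_DIRTY
 * combination tested above corresponds to a user-space mapping created with
 * MAP_SYNC, e.g.:
 *
 *      addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *                  MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
 *
 * For such mappings, dirty filesystem metadata must be made durable during
 * the write fault itself rather than being deferred to a later fsync().
 */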
862 /*
863  * By this point grab_mapping_entry() has ensured that we have a locked entry
864  * of the appropriate size so we don't have to worry about downgrading PMDs to
865  * PTEs.  If we happen to be trying to insert a PTE and there is a PMD entry
866  * already in the tree, we will skip the insertion and just dirty the PMD as
867  * appropriate.
868  */
869 static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf,
870                 const struct iomap_iter *iter, void *entry, pfn_t pfn,
871                 unsigned long flags)
872 {
873         struct address_space *mapping = vmf->vma->vm_file->f_mapping;
874         void *new_entry = dax_make_entry(pfn, flags);
875         bool write = iter->flags & IOMAP_WRITE;
876         bool dirty = write && !dax_fault_is_synchronous(iter, vmf->vma);
877         bool shared = iter->iomap.flags & IOMAP_F_SHARED;
878                                                   
879         if (dirty)
880                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
881 
882         if (shared || (dax_is_zero_entry(entry) && !(flags & DAX_ZERO_PAGE))) {
883                 unsigned long index = xas->xa_index;
884                 /* we are replacing a zero page with block mapping */
885                 if (dax_is_pmd_entry(entry))
886                         unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR,
887                                         PG_PMD_NR, false);
888                 else /* pte entry */
889                         unmap_mapping_pages(mapping, index, 1, false);
890         }
891                                                   
892         xas_reset(xas);                           
893         xas_lock_irq(xas);                        
894         if (shared || dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
895                 void *old;
896 
897                 dax_disassociate_entry(entry, mapping, false);
898                 dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address,
899                                 shared);
900                 /*
901                  * Only swap our new entry into the page cache if the current
902                  * entry is a zero page or an empty entry.  If a normal PTE or
903                  * PMD entry is already in the cache, we leave it alone.  This
904                  * means that if we are trying to insert a PTE and an
905                  * existing entry is a PMD, we will just leave the PMD in the
906                  * tree and dirty it if necessary.
907                  */
908                 old = dax_lock_entry(xas, new_entry);
909                 WARN_ON_ONCE(old != xa_mk_value(xa_to_value(entry) |
910                                         DAX_LOCKED));
911                 entry = new_entry;
912         } else {
913                 xas_load(xas);  /* Walk the xa_state */
914         }
915 
916         if (dirty)
917                 xas_set_mark(xas, PAGECACHE_TAG_DIRTY);
918 
919         if (write && shared)
920                 xas_set_mark(xas, PAGECACHE_TAG_TOWRITE);
921                                                   
922         xas_unlock_irq(xas);                      
923         return entry;                             
924 }                                                 
925                                                   
926 static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev,
927                 struct address_space *mapping, void *entry)
928 {                                                 
929         unsigned long pfn, index, count, end;     
930         long ret = 0;                             
931         struct vm_area_struct *vma;               
932                                                   
933         /*                                        
934          * A page got tagged dirty in DAX mapping? Something is seriously
935          * wrong.                                 
936          */                                       
937         if (WARN_ON(!xa_is_value(entry)))         
938                 return -EIO;                      
939                                                   
940         if (unlikely(dax_is_locked(entry))) {     
941                 void *old_entry = entry;          
942                                                   
943                 entry = get_unlocked_entry(xas, 0);
944 
945                 /* Entry got punched out / reallocated? */
946                 if (!entry || WARN_ON_ONCE(!xa_is_value(entry)))
947                         goto put_unlocked;
948                 /*
949                  * Entry got reallocated elsewhere? No need to writeback.
950                  * We have to compare pfns as we must not bail out due to
951                  * difference in lockbit or entry type.
952                  */
953                 if (dax_to_pfn(old_entry) != dax_to_pfn(entry))
954                         goto put_unlocked;
955                 if (WARN_ON_ONCE(dax_is_empty_entry(entry) ||
956                                         dax_is_zero_entry(entry))) {
957                         ret = -EIO;
958                         goto put_unlocked;
959                 }
960 
961                 /* Another fsync thread may have already done this entry */
962                 if (!xas_get_mark(xas, PAGECACHE_TAG_TOWRITE))
963                         goto put_unlocked;
964         }                                         
965                                                   
966         /* Lock the entry to serialize with page faults */
967         dax_lock_entry(xas, entry);               
968                                                   
969         /*                                        
970          * We can clear the tag now but we have to be careful so that concurrent
971          * dax_writeback_one() calls for the same index cannot finish before we
972          * actually flush the caches. This is achieved as the calls will look
973          * at the entry only under the i_pages lock and once they do that
974          * they will see the entry locked and wait for it to unlock.
975          */
976         xas_clear_mark(xas, PAGECACHE_TAG_TOWRITE);
977         xas_unlock_irq(xas);                      
978                                                   
979         /*                                        
980          * If dax_writeback_mapping_range() was given a wbc->range_start
981          * in the middle of a PMD, the 'index' we use needs to be
982          * aligned to the start of the PMD.
983          * This allows us to flush for PMD_SIZE and not have to worry about
984          * partial PMD writebacks.
985          */                                       
986         pfn = dax_to_pfn(entry);                  
987         count = 1UL << dax_entry_order(entry);    
988         index = xas->xa_index & ~(count - 1);     
989         end = index + count - 1;                  
990                                                   
991         /* Walk all mappings of a given index of the file and writeprotect them */
992         i_mmap_lock_read(mapping);
993         vma_interval_tree_foreach(vma, &mapping->i_mmap, index, end) {
994                 pfn_mkclean_range(pfn, count, index, vma);
995                 cond_resched();                   
996         }                                         
997         i_mmap_unlock_read(mapping);              
998                                                   
999         dax_flush(dax_dev, page_address(pfn_to_page(pfn)), count * PAGE_SIZE);
1000         /*
1001          * After we have flushed the cache, we can clear the dirty tag. There
1002          * cannot be new dirty data in the pfn after the flush has completed as
1003          * the pfn mappings are writeprotected and fault waits for mapping
1004          * entry lock.
1005          */                                      
1006         xas_reset(xas);                          
1007         xas_lock_irq(xas);                       
1008         xas_store(xas, entry);                   
1009         xas_clear_mark(xas, PAGECACHE_TAG_DIRTY);
1010         dax_wake_entry(xas, entry, WAKE_NEXT);
1011 
1012         trace_dax_writeback_one(mapping->host, xas->xa_index, count);
1013         return ret;
1014 
1015  put_unlocked:
1016         put_unlocked_entry(xas, entry, WAKE_NEXT);
1017         return ret;                              
1018 }                                                
1019                                                  
1020 /*
1021  * Flush the mapping to the persistent domain within the byte range of [start,
1022  * end]. This is required by data integrity operations to ensure file data is
1023  * on persistent storage prior to completion of the operation.
1024  */
1025 int dax_writeback_mapping_range(struct address_space *mapping,
1026                 struct dax_device *dax_dev, struct writeback_control *wbc)
1027 {
1028         XA_STATE(xas, &mapping->i_pages, wbc->range_start >> PAGE_SHIFT);
1029         struct inode *inode = mapping->host;
1030         pgoff_t end_index = wbc->range_end >> PAGE_SHIFT;
1031         void *entry;
1032         int ret = 0;
1033         unsigned int scanned = 0;
1034 
1035         if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
1036                 return -EIO;
1037 
1038         if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL)
1039                 return 0;
1040 
1041         trace_dax_writeback_range(inode, xas.xa_index, end_index);
1042 
1043         tag_pages_for_writeback(mapping, xas.xa_index, end_index);
1044 
1045         xas_lock_irq(&xas);
1046         xas_for_each_marked(&xas, entry, end_index, PAGECACHE_TAG_TOWRITE) {
1047                 ret = dax_writeback_one(&xas, dax_dev, mapping, entry);
1048                 if (ret < 0) {
1049                         mapping_set_error(mapping, ret);
1050                         break;
1051                 }
1052                 if (++scanned % XA_CHECK_SCHED)
1053                         continue;
1054 
1055                 xas_pause(&xas);
1056                 xas_unlock_irq(&xas);
1057                 cond_resched();
1058                 xas_lock_irq(&xas);
1059         }
1060         xas_unlock_irq(&xas);
1061         trace_dax_writeback_range_done(inode, xas.xa_index, end_index);
1062         return ret;
1063 }
1064 EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
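/*
 * Example (not part of fs/dax.c): how a filesystem typically wires
 * dax_writeback_mapping_range() into its ->writepages() method so that
 * fsync()/msync() flush CPU caches for dirty DAX entries. A sketch loosely
 * modeled on ext4_dax_writepages(); the MYFS_SB() accessor and the s_daxdev
 * field cached at mount time are illustrative assumptions.
 */
static int myfs_dax_writepages(struct address_space *mapping,
                               struct writeback_control *wbc)
{
        /* The dax_device is assumed to have been looked up at mount time. */
        struct dax_device *dax_dev = MYFS_SB(mapping->host->i_sb)->s_daxdev;

        return dax_writeback_mapping_range(mapping, dax_dev, wbc);
}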
1065                                                  
1066 static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos,
1067                 size_t size, void **kaddr, pfn_t *pfnp)
1068 {
1069         pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1070         int id, rc = 0;
1071         long length;
1072 
1073         id = dax_read_lock();
1074         length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
1075                                    DAX_ACCESS, kaddr, pfnp);
1076         if (length < 0) {
1077                 rc = length;
1078                 goto out;
1079         }
1080         if (!pfnp)
1081                 goto out_check_addr;
1082         rc = -EINVAL;
1083         if (PFN_PHYS(length) < size)
1084                 goto out;
1085         if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
1086                 goto out;
1087         /* For larger pages we need devmap */
1088         if (length > 1 && !pfn_t_devmap(*pfnp))
1089                 goto out;
1090         rc = 0;
1091 
1092 out_check_addr:
1093         if (!kaddr)
1094                 goto out;
1095         if (!*kaddr)
1096                 rc = -EFAULT;
1097 out:
1098         dax_read_unlock(id);
1099         return rc;
1100 }
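/*
 * Example (not part of fs/dax.c): the calling convention that
 * dax_iomap_direct_access() wraps. A minimal sketch: the returned kernel
 * address is only valid while the dax read lock is held, so the access is
 * performed before dax_read_unlock(). The pgoff is assumed to already be a
 * device page offset (as produced by dax_iomap_pgoff() above).
 */
static int example_zero_one_dax_page(struct dax_device *dax_dev, pgoff_t pgoff)
{
        void *kaddr;
        long nr;
        int id;

        id = dax_read_lock();
        nr = dax_direct_access(dax_dev, pgoff, 1, DAX_ACCESS, &kaddr, NULL);
        if (nr >= 1)
                memset(kaddr, 0, PAGE_SIZE);
        dax_read_unlock(id);

        return nr < 0 ? (int)nr : 0;
}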
1101                                                  
1102 /**                                              
1103  * dax_iomap_copy_around - Prepare for an una    
1104  * by copying the data before and after the r    
1105  * @pos:        address to do copy from.         
1106  * @length:     size of copy operation.          
1107  * @align_size: aligned w.r.t align_size (eit    
1108  * @srcmap:     iomap srcmap                     
1109  * @daddr:      destination address to copy t    
1110  *                                               
1111  * This can be called from two places. Either    
1112  * aligned), to copy the length size data to     
1113  * write operation, dax_iomap_iter() might ca    
1114  * start or end unaligned address. In the lat    
1115  * aligned ranges is taken care by dax_iomap_    
1116  * If the srcmap contains invalid data, such     
1117  * area to make sure no old data remains.        
1118  */                                              
1119 static int dax_iomap_copy_around(loff_t pos,     
1120                 const struct iomap *srcmap, v    
1121 {                                                
1122         loff_t head_off = pos & (align_size -    
1123         size_t size = ALIGN(head_off + length    
1124         loff_t end = pos + length;               
1125         loff_t pg_end = round_up(end, align_s    
1126         /* copy_all is usually in page fault     
1127         bool copy_all = head_off == 0 && end     
1128         /* zero the edges if srcmap is a HOLE    
1129         bool zero_edge = srcmap->flags & IOMA    
1130                          srcmap->type == IOMA    
1131         void *saddr = NULL;                      
1132         int ret = 0;                             
1133                                                  
1134         if (!zero_edge) {                        
1135                 ret = dax_iomap_direct_access    
1136                 if (ret)                         
1137                         return dax_mem2blk_er    
1138         }                                        
1139                                                  
1140         if (copy_all) {                          
1141                 if (zero_edge)                   
1142                         memset(daddr, 0, size    
1143                 else                             
1144                         ret = copy_mc_to_kern    
1145                 goto out;                        
1146         }                                        
1147                                                  
1148         /* Copy the head part of the range */    
1149         if (head_off) {                          
1150                 if (zero_edge)                   
1151                         memset(daddr, 0, head    
1152                 else {                           
1153                         ret = copy_mc_to_kern    
1154                         if (ret)                 
1155                                 return -EIO;     
1156                 }                                
1157         }                                        
1158                                                  
1159         /* Copy the tail part of the range */    
1160         if (end < pg_end) {                      
1161                 loff_t tail_off = head_off +     
1162                 loff_t tail_len = pg_end - en    
1163                                                  
1164                 if (zero_edge)                   
1165                         memset(daddr + tail_o    
1166                 else {                           
1167                         ret = copy_mc_to_kern    
1168                                                  
1169                         if (ret)                 
1170                                 return -EIO;     
1171                 }                                
1172         }                                        
1173 out:                                             
1174         if (zero_edge)                           
1175                 dax_flush(srcmap->dax_dev, da    
1176         return ret ? -EIO : 0;                   
1177 }                                                
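/*
 * Worked example for the helper above (an illustration, not code from this
 * file): an unaligned write of length 1024 at pos 1536 with align_size 4096
 * gives head_off = 1536, end = 2560 and pg_end = 4096. dax_iomap_copy_around()
 * therefore copies (or zeroes) bytes [0, 1536) of the block into daddr as the
 * head, and bytes [2560, 4096) as the tail, leaving the middle 1024 bytes for
 * the caller to fill with the new data.
 */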
1178                                                  
1179 /*
1180  * The user has performed a load from a hole in the file.  Allocating a new
1181  * page in the file would cause excessive storage usage for workloads with
1182  * sparse files.  Instead we insert a read-only mapping of the 4k zero page.
1183  * If this page is ever written to we will re-fault and change the mapping to
1184  * point to real DAX storage instead.
1185  */
1186 static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf,
1187                 const struct iomap_iter *iter, void **entry)
1188 {
1189         struct inode *inode = iter->inode;
1190         unsigned long vaddr = vmf->address;
1191         pfn_t pfn = pfn_to_pfn_t(my_zero_pfn(vaddr));
1192         vm_fault_t ret;
1193 
1194         *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE);
1195 
1196         ret = vmf_insert_mixed(vmf->vma, vaddr, pfn);
1197         trace_dax_load_hole(inode, vmf, ret);
1198         return ret;
1199 }
1200                                                  
1201 #ifdef CONFIG_FS_DAX_PMD                         
1202 static vm_fault_t dax_pmd_load_hole(struct xa    
1203                 const struct iomap_iter *iter    
1204 {                                                
1205         struct address_space *mapping = vmf->    
1206         unsigned long pmd_addr = vmf->address    
1207         struct vm_area_struct *vma = vmf->vma    
1208         struct inode *inode = mapping->host;     
1209         pgtable_t pgtable = NULL;                
1210         struct folio *zero_folio;                
1211         spinlock_t *ptl;                         
1212         pmd_t pmd_entry;                         
1213         pfn_t pfn;                               
1214                                                  
1215         zero_folio = mm_get_huge_zero_folio(v    
1216                                                  
1217         if (unlikely(!zero_folio))               
1218                 goto fallback;                   
1219                                                  
1220         pfn = page_to_pfn_t(&zero_folio->page    
1221         *entry = dax_insert_entry(xas, vmf, i    
1222                                   DAX_PMD | D    
1223                                                  
1224         if (arch_needs_pgtable_deposit()) {      
1225                 pgtable = pte_alloc_one(vma->    
1226                 if (!pgtable)                    
1227                         return VM_FAULT_OOM;     
1228         }                                        
1229                                                  
1230         ptl = pmd_lock(vmf->vma->vm_mm, vmf->    
1231         if (!pmd_none(*(vmf->pmd))) {            
1232                 spin_unlock(ptl);                
1233                 goto fallback;                   
1234         }                                        
1235                                                  
1236         if (pgtable) {                           
1237                 pgtable_trans_huge_deposit(vm    
1238                 mm_inc_nr_ptes(vma->vm_mm);      
1239         }                                        
1240         pmd_entry = mk_pmd(&zero_folio->page,    
1241         pmd_entry = pmd_mkhuge(pmd_entry);       
1242         set_pmd_at(vmf->vma->vm_mm, pmd_addr,    
1243         spin_unlock(ptl);                        
1244         trace_dax_pmd_load_hole(inode, vmf, z    
1245         return VM_FAULT_NOPAGE;                  
1246                                                  
1247 fallback:                                        
1248         if (pgtable)                             
1249                 pte_free(vma->vm_mm, pgtable)    
1250         trace_dax_pmd_load_hole_fallback(inod    
1251         return VM_FAULT_FALLBACK;                
1252 }                                                
1253 #else                                            
1254 static vm_fault_t dax_pmd_load_hole(struct xa    
1255                 const struct iomap_iter *iter    
1256 {                                                
1257         return VM_FAULT_FALLBACK;                
1258 }                                                
1259 #endif /* CONFIG_FS_DAX_PMD */                   
1260                                                  
1261 static s64 dax_unshare_iter(struct iomap_iter    
1262 {                                                
1263         struct iomap *iomap = &iter->iomap;      
1264         const struct iomap *srcmap = iomap_it    
1265         loff_t copy_pos = iter->pos;             
1266         u64 copy_len = iomap_length(iter);       
1267         u32 mod;                                 
1268         int id = 0;                              
1269         s64 ret = 0;                             
1270         void *daddr = NULL, *saddr = NULL;       
1271                                                  
1272         if (!iomap_want_unshare_iter(iter))      
1273                 return iomap_length(iter);       
1274                                                  
1275         /*                                       
1276          * Extend the file range to be aligne    
1277          * we need to copy entire blocks, not    
1278          * Invalidate the mapping because we'    
1279          */                                      
1280         mod = offset_in_page(copy_pos);          
1281         if (mod) {                               
1282                 copy_len += mod;                 
1283                 copy_pos -= mod;                 
1284         }                                        
1285                                                  
1286         mod = offset_in_page(copy_pos + copy_    
1287         if (mod)                                 
1288                 copy_len += PAGE_SIZE - mod;     
1289                                                  
1290         invalidate_inode_pages2_range(iter->i    
1291                                       copy_po    
1292                                       (copy_p    
1293                                                  
1294         id = dax_read_lock();                    
1295         ret = dax_iomap_direct_access(iomap,     
1296         if (ret < 0)                             
1297                 goto out_unlock;                 
1298                                                  
1299         ret = dax_iomap_direct_access(srcmap,    
1300         if (ret < 0)                             
1301                 goto out_unlock;                 
1302                                                  
1303         if (copy_mc_to_kernel(daddr, saddr, c    
1304                 ret = iomap_length(iter);        
1305         else                                     
1306                 ret = -EIO;                      
1307                                                  
1308 out_unlock:                                      
1309         dax_read_unlock(id);                     
1310         return dax_mem2blk_err(ret);             
1311 }                                                
1312                                                  
1313 int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
1314                 const struct iomap_ops *ops)
1315 {
1316         struct iomap_iter iter = {
1317                 .inode          = inode,
1318                 .pos            = pos,
1319                 .flags          = IOMAP_WRITE | IOMAP_UNSHARE | IOMAP_DAX,
1320         };
1321         loff_t size = i_size_read(inode);
1322         int ret;
1323 
1324         if (pos < 0 || pos >= size)
1325                 return 0;
1326 
1327         iter.len = min(len, size - pos);
1328         while ((ret = iomap_iter(&iter, ops)) > 0)
1329                 iter.processed = dax_unshare_iter(&iter);
1330         return ret;
1331 }
1332 EXPORT_SYMBOL_GPL(dax_file_unshare);
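/*
 * Example (not part of fs/dax.c): dax_file_unshare() is the DAX counterpart of
 * iomap_file_unshare() and is typically called from a filesystem's
 * fallocate(FALLOC_FL_UNSHARE_RANGE) path when the inode is in DAX mode, so
 * that shared (reflinked) extents are copied up before they can be written
 * through a mapping. A sketch loosely following the xfs usage; the
 * myfs_write_iomap_ops instance is an assumption.
 */
extern const struct iomap_ops myfs_write_iomap_ops;     /* assumed */

static int myfs_unshare_range(struct inode *inode, loff_t offset, loff_t len)
{
        if (IS_DAX(inode))
                return dax_file_unshare(inode, offset, len,
                                        &myfs_write_iomap_ops);
        return iomap_file_unshare(inode, offset, len, &myfs_write_iomap_ops);
}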
1333                                                  
1334 static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size)
1335 {
1336         const struct iomap *iomap = &iter->iomap;
1337         const struct iomap *srcmap = iomap_iter_srcmap(iter);
1338         unsigned offset = offset_in_page(pos);
1339         pgoff_t pgoff = dax_iomap_pgoff(iomap, pos);
1340         void *kaddr;
1341         long ret;
1342 
1343         ret = dax_direct_access(iomap->dax_dev, pgoff, 1, DAX_ACCESS, &kaddr,
1344                                 NULL);
1345         if (ret < 0)
1346                 return dax_mem2blk_err(ret);
1347 
1348         memset(kaddr + offset, 0, size);
1349         if (iomap->flags & IOMAP_F_SHARED)
1350                 ret = dax_iomap_copy_around(pos, size, PAGE_SIZE, srcmap,
1351                                             kaddr);
1352         else
1353                 dax_flush(iomap->dax_dev, kaddr + offset, size);
1354         return ret;
1355 }
1356                                                  
1357 static s64 dax_zero_iter(struct iomap_iter *i    
1358 {                                                
1359         const struct iomap *iomap = &iter->io    
1360         const struct iomap *srcmap = iomap_it    
1361         loff_t pos = iter->pos;                  
1362         u64 length = iomap_length(iter);         
1363         s64 written = 0;                         
1364                                                  
1365         /* already zeroed?  we're done. */       
1366         if (srcmap->type == IOMAP_HOLE || src    
1367                 return length;                   
1368                                                  
1369         /*                                       
1370          * invalidate the pages whose sharing    
1371          * because of CoW.                       
1372          */                                      
1373         if (iomap->flags & IOMAP_F_SHARED)       
1374                 invalidate_inode_pages2_range    
1375                                                  
1376                                                  
1377                                                  
1378         do {                                     
1379                 unsigned offset = offset_in_p    
1380                 unsigned size = min_t(u64, PA    
1381                 pgoff_t pgoff = dax_iomap_pgo    
1382                 long rc;                         
1383                 int id;                          
1384                                                  
1385                 id = dax_read_lock();            
1386                 if (IS_ALIGNED(pos, PAGE_SIZE    
1387                         rc = dax_zero_page_ra    
1388                 else                             
1389                         rc = dax_memzero(iter    
1390                 dax_read_unlock(id);             
1391                                                  
1392                 if (rc < 0)                      
1393                         return rc;               
1394                 pos += size;                     
1395                 length -= size;                  
1396                 written += size;                 
1397         } while (length > 0);                    
1398                                                  
1399         if (did_zero)                            
1400                 *did_zero = true;                
1401         return written;                          
1402 }                                                
1403                                                  
1404 int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
1405                 const struct iomap_ops *ops)
1406 {
1407         struct iomap_iter iter = {
1408                 .inode          = inode,
1409                 .pos            = pos,
1410                 .len            = len,
1411                 .flags          = IOMAP_DAX | IOMAP_ZERO,
1412         };
1413         int ret;
1414 
1415         while ((ret = iomap_iter(&iter, ops)) > 0)
1416                 iter.processed = dax_zero_iter(&iter, did_zero);
1417         return ret;
1418 }
1419 EXPORT_SYMBOL_GPL(dax_zero_range);
1420 
1421 int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
1422                 const struct iomap_ops *ops)
1423 {
1424         unsigned int blocksize = i_blocksize(inode);
1425         unsigned int off = pos & (blocksize - 1);
1426 
1427         /* Block boundary? Nothing to do */
1428         if (!off)
1429                 return 0;
1430         return dax_zero_range(inode, pos, blocksize - off, did_zero, ops);
1431 }
1432 EXPORT_SYMBOL_GPL(dax_truncate_page);
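/*
 * Example (not part of fs/dax.c): typical callers of the two helpers above.
 * A sketch modeled on how ext4/xfs zero the partial block at the new EOF
 * during truncate and zero sub-block ranges when punching holes; the
 * myfs_read_iomap_ops instance is an assumption.
 */
extern const struct iomap_ops myfs_read_iomap_ops;      /* assumed */

static int myfs_dax_truncate_block(struct inode *inode, loff_t newsize)
{
        /* Zero from the new EOF to the end of that filesystem block. */
        return dax_truncate_page(inode, newsize, NULL, &myfs_read_iomap_ops);
}

static int myfs_dax_zero_partial(struct inode *inode, loff_t pos, loff_t len)
{
        /* Zero an arbitrary sub-range, e.g. the edges of a hole punch. */
        return dax_zero_range(inode, pos, len, NULL, &myfs_read_iomap_ops);
}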
1433                                                  
1434 static loff_t dax_iomap_iter(const struct iom    
1435                 struct iov_iter *iter)           
1436 {                                                
1437         const struct iomap *iomap = &iomi->io    
1438         const struct iomap *srcmap = iomap_it    
1439         loff_t length = iomap_length(iomi);      
1440         loff_t pos = iomi->pos;                  
1441         struct dax_device *dax_dev = iomap->d    
1442         loff_t end = pos + length, done = 0;     
1443         bool write = iov_iter_rw(iter) == WRI    
1444         bool cow = write && iomap->flags & IO    
1445         ssize_t ret = 0;                         
1446         size_t xfer;                             
1447         int id;                                  
1448                                                  
1449         if (!write) {                            
1450                 end = min(end, i_size_read(io    
1451                 if (pos >= end)                  
1452                         return 0;                
1453                                                  
1454                 if (iomap->type == IOMAP_HOLE    
1455                         return iov_iter_zero(    
1456         }                                        
1457                                                  
1458         /*                                       
1459          * In DAX mode, enforce either pure o    
1460          * writes to unwritten extents as par    
1461          */                                      
1462         if (WARN_ON_ONCE(iomap->type != IOMAP    
1463                         !(iomap->flags & IOMA    
1464                 return -EIO;                     
1465                                                  
1466         /*                                       
1467          * Write can allocate block for an ar    
1468          * into page tables. We have to tear     
1469          * written by write(2) is visible in     
1470          */                                      
1471         if (iomap->flags & IOMAP_F_NEW || cow    
1472                 /*                               
1473                  * Filesystem allows CoW on n    
1474                  * may have been mmapped with    
1475                  * invalidate its dax entries    
1476                  * in advance.                   
1477                  */                              
1478                 if (cow)                         
1479                         __dax_clear_dirty_ran    
1480                                                  
1481                                                  
1482                 invalidate_inode_pages2_range    
1483                                                  
1484                                                  
1485         }                                        
1486                                                  
1487         id = dax_read_lock();                    
1488         while (pos < end) {                      
1489                 unsigned offset = pos & (PAGE    
1490                 const size_t size = ALIGN(len    
1491                 pgoff_t pgoff = dax_iomap_pgo    
1492                 ssize_t map_len;                 
1493                 bool recovery = false;           
1494                 void *kaddr;                     
1495                                                  
1496                 if (fatal_signal_pending(curr    
1497                         ret = -EINTR;            
1498                         break;                   
1499                 }                                
1500                                                  
1501                 map_len = dax_direct_access(d    
1502                                 DAX_ACCESS, &    
1503                 if (map_len == -EHWPOISON &&     
1504                         map_len = dax_direct_    
1505                                         PHYS_    
1506                                         &kadd    
1507                         if (map_len > 0)         
1508                                 recovery = tr    
1509                 }                                
1510                 if (map_len < 0) {               
1511                         ret = dax_mem2blk_err    
1512                         break;                   
1513                 }                                
1514                                                  
1515                 if (cow) {                       
1516                         ret = dax_iomap_copy_    
1517                                                  
1518                         if (ret)                 
1519                                 break;           
1520                 }                                
1521                                                  
1522                 map_len = PFN_PHYS(map_len);     
1523                 kaddr += offset;                 
1524                 map_len -= offset;               
1525                 if (map_len > end - pos)         
1526                         map_len = end - pos;     
1527                                                  
1528                 if (recovery)                    
1529                         xfer = dax_recovery_w    
1530                                         map_l    
1531                 else if (write)                  
1532                         xfer = dax_copy_from_    
1533                                         map_l    
1534                 else                             
1535                         xfer = dax_copy_to_it    
1536                                         map_l    
1537                                                  
1538                 pos += xfer;                     
1539                 length -= xfer;                  
1540                 done += xfer;                    
1541                                                  
1542                 if (xfer == 0)                   
1543                         ret = -EFAULT;           
1544                 if (xfer < map_len)              
1545                         break;                   
1546         }                                        
1547         dax_read_unlock(id);                     
1548                                                  
1549         return done ? done : ret;                
1550 }                                                
1551                                                  
1552 /**                                              
1553  * dax_iomap_rw - Perform I/O to a DAX file      
1554  * @iocb:       The control block for this I/    
1555  * @iter:       The addresses to do I/O from     
1556  * @ops:        iomap ops passed from the fil    
1557  *                                               
1558  * This function performs read and write oper    
1559  * persistent memory.  The callers needs to t    
1560  * and evicting any page cache pages in the r    
1561  */                                              
1562 ssize_t                                          
1563 dax_iomap_rw(struct kiocb *iocb, struct iov_i    
1564                 const struct iomap_ops *ops)     
1565 {                                                
1566         struct iomap_iter iomi = {               
1567                 .inode          = iocb->ki_fi    
1568                 .pos            = iocb->ki_po    
1569                 .len            = iov_iter_co    
1570                 .flags          = IOMAP_DAX,     
1571         };                                       
1572         loff_t done = 0;                         
1573         int ret;                                 
1574                                                  
1575         if (!iomi.len)                           
1576                 return 0;                        
1577                                                  
1578         if (iov_iter_rw(iter) == WRITE) {        
1579                 lockdep_assert_held_write(&io    
1580                 iomi.flags |= IOMAP_WRITE;       
1581         } else {                                 
1582                 lockdep_assert_held(&iomi.ino    
1583         }                                        
1584                                                  
1585         if (iocb->ki_flags & IOCB_NOWAIT)        
1586                 iomi.flags |= IOMAP_NOWAIT;      
1587                                                  
1588         while ((ret = iomap_iter(&iomi, ops))    
1589                 iomi.processed = dax_iomap_it    
1590                                                  
1591         done = iomi.pos - iocb->ki_pos;          
1592         iocb->ki_pos = iomi.pos;                 
1593         return done ? done : ret;                
1594 }                                                
1595 EXPORT_SYMBOL_GPL(dax_iomap_rw);                 
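/*
 * Example (not part of fs/dax.c): a minimal ->read_iter/->write_iter pair for
 * a DAX file, loosely modeled on the ext4/xfs callers of dax_iomap_rw(). The
 * myfs_iomap_ops instance is an assumption, and a real filesystem adds more
 * checks (IOCB_NOWAIT trylocking, O_APPEND handling, quota, etc.).
 */
extern const struct iomap_ops myfs_iomap_ops;           /* assumed */

static ssize_t myfs_dax_read_iter(struct kiocb *iocb, struct iov_iter *to)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        /* dax_iomap_rw() asserts that the caller holds the inode lock. */
        inode_lock_shared(inode);
        ret = dax_iomap_rw(iocb, to, &myfs_iomap_ops);
        inode_unlock_shared(inode);

        file_accessed(iocb->ki_filp);
        return ret;
}

static ssize_t myfs_dax_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct inode *inode = file_inode(iocb->ki_filp);
        ssize_t ret;

        inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0)
                ret = dax_iomap_rw(iocb, from, &myfs_iomap_ops);
        inode_unlock(inode);

        if (ret > 0)
                ret = generic_write_sync(iocb, ret);
        return ret;
}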
1596                                                  
1597 static vm_fault_t dax_fault_return(int error)    
1598 {                                                
1599         if (error == 0)                          
1600                 return VM_FAULT_NOPAGE;          
1601         return vmf_error(error);                 
1602 }                                                
1603                                                  
1604 /*
1605  * When handling a synchronous page fault and the inode need a fsync, we can
1606  * insert the PTE/PMD into page tables only after that fsync happened. Skip
1607  * insertion for now and return the pfn so that caller can insert it after the
1608  * fsync is done.
1609  */
1610 static vm_fault_t dax_fault_synchronous_pfnp(pfn_t *pfnp, pfn_t pfn)
1611 {
1612         if (WARN_ON_ONCE(!pfnp))
1613                 return VM_FAULT_SIGBUS;
1614         *pfnp = pfn;
1615         return VM_FAULT_NEEDDSYNC;
1616 }
1617                                                  
1618 static vm_fault_t dax_fault_cow_page(struct vm_fault *vmf,
1619                 const struct iomap_iter *iter)
1620 {
1621         vm_fault_t ret;
1622         int error = 0;
1623 
1624         switch (iter->iomap.type) {
1625         case IOMAP_HOLE:
1626         case IOMAP_UNWRITTEN:
1627                 clear_user_highpage(vmf->cow_page, vmf->address);
1628                 break;
1629         case IOMAP_MAPPED:
1630                 error = copy_cow_page_dax(vmf, iter);
1631                 break;
1632         default:
1633                 WARN_ON_ONCE(1);
1634                 error = -EIO;
1635                 break;
1636         }
1637 
1638         if (error)
1639                 return dax_fault_return(error);
1640 
1641         __SetPageUptodate(vmf->cow_page);
1642         ret = finish_fault(vmf);
1643         if (!ret)
1644                 return VM_FAULT_DONE_COW;
1645         return ret;
1646 }
1647                                                  
1648 /**                                              
1649  * dax_fault_iter - Common actor to handle pf    
1650  * @vmf:        vm fault instance                
1651  * @iter:       iomap iter                       
1652  * @pfnp:       pfn to be returned               
1653  * @xas:        the dax mapping tree of a fil    
1654  * @entry:      an unlocked dax entry to be i    
1655  * @pmd:        distinguish whether it is a p    
1656  */                                              
1657 static vm_fault_t dax_fault_iter(struct vm_fa    
1658                 const struct iomap_iter *iter    
1659                 struct xa_state *xas, void **    
1660 {                                                
1661         const struct iomap *iomap = &iter->io    
1662         const struct iomap *srcmap = iomap_it    
1663         size_t size = pmd ? PMD_SIZE : PAGE_S    
1664         loff_t pos = (loff_t)xas->xa_index <<    
1665         bool write = iter->flags & IOMAP_WRIT    
1666         unsigned long entry_flags = pmd ? DAX    
1667         int err = 0;                             
1668         pfn_t pfn;                               
1669         void *kaddr;                             
1670                                                  
1671         if (!pmd && vmf->cow_page)               
1672                 return dax_fault_cow_page(vmf    
1673                                                  
1674         /* if we are reading UNWRITTEN and HO    
1675         if (!write &&                            
1676             (iomap->type == IOMAP_UNWRITTEN |    
1677                 if (!pmd)                        
1678                         return dax_load_hole(    
1679                 return dax_pmd_load_hole(xas,    
1680         }                                        
1681                                                  
1682         if (iomap->type != IOMAP_MAPPED && !(    
1683                 WARN_ON_ONCE(1);                 
1684                 return pmd ? VM_FAULT_FALLBAC    
1685         }                                        
1686                                                  
1687         err = dax_iomap_direct_access(iomap,     
1688         if (err)                                 
1689                 return pmd ? VM_FAULT_FALLBAC    
1690                                                  
1691         *entry = dax_insert_entry(xas, vmf, i    
1692                                                  
1693         if (write && iomap->flags & IOMAP_F_S    
1694                 err = dax_iomap_copy_around(p    
1695                 if (err)                         
1696                         return dax_fault_retu    
1697         }                                        
1698                                                  
1699         if (dax_fault_is_synchronous(iter, vm    
1700                 return dax_fault_synchronous_    
1701                                                  
1702         /* insert PMD pfn */                     
1703         if (pmd)                                 
1704                 return vmf_insert_pfn_pmd(vmf    
1705                                                  
1706         /* insert PTE pfn */                     
1707         if (write)                               
1708                 return vmf_insert_mixed_mkwri    
1709         return vmf_insert_mixed(vmf->vma, vmf    
1710 }                                                
1711                                                  
1712 static vm_fault_t dax_iomap_pte_fault(struct     
1713                                int *iomap_err    
1714 {                                                
1715         struct address_space *mapping = vmf->    
1716         XA_STATE(xas, &mapping->i_pages, vmf-    
1717         struct iomap_iter iter = {               
1718                 .inode          = mapping->ho    
1719                 .pos            = (loff_t)vmf    
1720                 .len            = PAGE_SIZE,     
1721                 .flags          = IOMAP_DAX |    
1722         };                                       
1723         vm_fault_t ret = 0;                      
1724         void *entry;                             
1725         int error;                               
1726                                                  
1727         trace_dax_pte_fault(iter.inode, vmf,     
1728         /*                                       
1729          * Check whether offset isn't beyond     
1730          * to hold locks serializing us with     
1731          * a reliable test.                      
1732          */                                      
1733         if (iter.pos >= i_size_read(iter.inod    
1734                 ret = VM_FAULT_SIGBUS;           
1735                 goto out;                        
1736         }                                        
1737                                                  
1738         if ((vmf->flags & FAULT_FLAG_WRITE) &    
1739                 iter.flags |= IOMAP_WRITE;       
1740                                                  
1741         entry = grab_mapping_entry(&xas, mapp    
1742         if (xa_is_internal(entry)) {             
1743                 ret = xa_to_internal(entry);     
1744                 goto out;                        
1745         }                                        
1746                                                  
1747         /*                                       
1748          * It is possible, particularly with     
1749          * mappings, that we have raced with     
1750          * the PTE we need to set up.  If so     
1751          * retried.                              
1752          */                                      
1753         if (pmd_trans_huge(*vmf->pmd) || pmd_    
1754                 ret = VM_FAULT_NOPAGE;           
1755                 goto unlock_entry;               
1756         }                                        
1757                                                  
1758         while ((error = iomap_iter(&iter, ops    
1759                 if (WARN_ON_ONCE(iomap_length    
1760                         iter.processed = -EIO    
1761                         continue;                
1762                 }                                
1763                                                  
1764                 ret = dax_fault_iter(vmf, &it    
1765                 if (ret != VM_FAULT_SIGBUS &&    
1766                     (iter.iomap.flags & IOMAP    
1767                         count_vm_event(PGMAJF    
1768                         count_memcg_event_mm(    
1769                         ret |= VM_FAULT_MAJOR    
1770                 }                                
1771                                                  
1772                 if (!(ret & VM_FAULT_ERROR))     
1773                         iter.processed = PAGE    
1774         }                                        
1775                                                  
1776         if (iomap_errp)                          
1777                 *iomap_errp = error;             
1778         if (!ret && error)                       
1779                 ret = dax_fault_return(error)    
1780                                                  
1781 unlock_entry:                                    
1782         dax_unlock_entry(&xas, entry);           
1783 out:                                             
1784         trace_dax_pte_fault_done(iter.inode,     
1785         return ret;                              
1786 }                                                
1787                                                  
1788 #ifdef CONFIG_FS_DAX_PMD                         
1789 static bool dax_fault_check_fallback(struct v    
1790                 pgoff_t max_pgoff)               
1791 {                                                
1792         unsigned long pmd_addr = vmf->address    
1793         bool write = vmf->flags & FAULT_FLAG_    
1794                                                  
1795         /*                                       
1796          * Make sure that the faulting addres    
1797          * the PMD offset from the start of t    
1798          * that a PMD range in the page table    
1799          * range in the page cache.              
1800          */                                      
1801         if ((vmf->pgoff & PG_PMD_COLOUR) !=      
1802             ((vmf->address >> PAGE_SHIFT) & P    
1803                 return true;                     
1804                                                  
1805         /* Fall back to PTEs if we're going t    
1806         if (write && !(vmf->vma->vm_flags & V    
1807                 return true;                     
1808                                                  
1809         /* If the PMD would extend outside th    
1810         if (pmd_addr < vmf->vma->vm_start)       
1811                 return true;                     
1812         if ((pmd_addr + PMD_SIZE) > vmf->vma-    
1813                 return true;                     
1814                                                  
1815         /* If the PMD would extend beyond the    
1816         if ((xas->xa_index | PG_PMD_COLOUR) >    
1817                 return true;                     
1818                                                  
1819         return false;                            
1820 }                                                
1821                                                  
1822 static vm_fault_t dax_iomap_pmd_fault(struct     
1823                                const struct i    
1824 {                                                
1825         struct address_space *mapping = vmf->    
1826         XA_STATE_ORDER(xas, &mapping->i_pages    
1827         struct iomap_iter iter = {               
1828                 .inode          = mapping->ho    
1829                 .len            = PMD_SIZE,      
1830                 .flags          = IOMAP_DAX |    
1831         };                                       
1832         vm_fault_t ret = VM_FAULT_FALLBACK;      
1833         pgoff_t max_pgoff;                       
1834         void *entry;                             
1835                                                  
1836         if (vmf->flags & FAULT_FLAG_WRITE)       
1837                 iter.flags |= IOMAP_WRITE;       
1838                                                  
1839         /*                                       
1840          * Check whether offset isn't beyond     
1841          * supposed to hold locks serializing    
1842          * this is a reliable test.              
1843          */                                      
1844         max_pgoff = DIV_ROUND_UP(i_size_read(    
1845                                                  
1846         trace_dax_pmd_fault(iter.inode, vmf,     
1847                                                  
1848         if (xas.xa_index >= max_pgoff) {         
1849                 ret = VM_FAULT_SIGBUS;           
1850                 goto out;                        
1851         }                                        
1852                                                  
1853         if (dax_fault_check_fallback(vmf, &xa    
1854                 goto fallback;                   
1855                                                  
1856         /*                                       
1857          * grab_mapping_entry() will make sur    
1858          * a zero PMD entry or a DAX PMD.  If    
1859          * entry is already in the array, for    
1860          * VM_FAULT_FALLBACK.                    
1861          */                                      
1862         entry = grab_mapping_entry(&xas, mapp    
1863         if (xa_is_internal(entry)) {             
1864                 ret = xa_to_internal(entry);     
1865                 goto fallback;                   
1866         }                                        
1867                                                  
1868         /*                                       
1869          * It is possible, particularly with     
1870          * mappings, that we have raced with     
1871          * the PMD we need to set up.  If so     
1872          * retried.                              
1873          */                                      
1874         if (!pmd_none(*vmf->pmd) && !pmd_tran    
1875                         !pmd_devmap(*vmf->pmd    
1876                 ret = 0;                         
1877                 goto unlock_entry;               
1878         }                                        
1879                                                  
1880         iter.pos = (loff_t)xas.xa_index << PA    
1881         while (iomap_iter(&iter, ops) > 0) {     
1882                 if (iomap_length(&iter) < PMD    
1883                         continue; /* actually    
1884                                                  
1885                 ret = dax_fault_iter(vmf, &it    
1886                 if (ret != VM_FAULT_FALLBACK)    
1887                         iter.processed = PMD_    
1888         }                                        
1889                                                  
1890 unlock_entry:                                    
1891         dax_unlock_entry(&xas, entry);           
1892 fallback:                                        
1893         if (ret == VM_FAULT_FALLBACK) {          
1894                 split_huge_pmd(vmf->vma, vmf-    
1895                 count_vm_event(THP_FAULT_FALL    
1896         }                                        
1897 out:                                             
1898         trace_dax_pmd_fault_done(iter.inode,     
1899         return ret;                              
1900 }                                                
1901 #else                                            
1902 static vm_fault_t dax_iomap_pmd_fault(struct     
1903                                const struct i    
1904 {                                                
1905         return VM_FAULT_FALLBACK;                
1906 }                                                
1907 #endif /* CONFIG_FS_DAX_PMD */                   
1908                                                  
1909 /**
1910  * dax_iomap_fault - handle a page fault on a DAX file
1911  * @vmf: The description of the fault
1912  * @order: Order of the page to fault in
1913  * @pfnp: PFN to insert for synchronous faults if fsync is required
1914  * @iomap_errp: Storage for detailed error code in case of error
1915  * @ops: Iomap ops passed from the file system
1916  *
1917  * When a page fault occurs, filesystems may call this helper in
1918  * their fault handler for DAX files. dax_iomap_fault() assumes the caller
1919  * has done all the necessary locking for page fault to proceed
1920  * successfully.
1921  */
1922 vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
1923                     pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops)
1924 {
1925         if (order == 0)
1926                 return dax_iomap_pte_fault(vmf, pfnp, iomap_errp, ops);
1927         else if (order == PMD_ORDER)
1928                 return dax_iomap_pmd_fault(vmf, pfnp, ops);
1929         else
1930                 return VM_FAULT_FALLBACK;
1931 }
1932 EXPORT_SYMBOL_GPL(dax_iomap_fault);
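/*
 * Example (not part of fs/dax.c): a vm_operations_struct fault path built on
 * dax_iomap_fault(). A sketch following the usual ext4/xfs pattern; the
 * myfs_iomap_ops instance is an assumption, and the MAP_SYNC case (see
 * dax_finish_sync_fault() below) is left out here.
 */
extern const struct iomap_ops myfs_iomap_ops;           /* assumed */

static vm_fault_t myfs_dax_huge_fault(struct vm_fault *vmf, unsigned int order)
{
        struct inode *inode = file_inode(vmf->vma->vm_file);
        bool write = vmf->flags & FAULT_FLAG_WRITE;
        vm_fault_t ret;
        pfn_t pfn;

        if (write) {
                sb_start_pagefault(inode->i_sb);
                file_update_time(vmf->vma->vm_file);
        }

        ret = dax_iomap_fault(vmf, order, &pfn, NULL, &myfs_iomap_ops);

        if (write)
                sb_end_pagefault(inode->i_sb);
        return ret;
}

static vm_fault_t myfs_dax_fault(struct vm_fault *vmf)
{
        /* ->fault is the order-0 case of ->huge_fault. */
        return myfs_dax_huge_fault(vmf, 0);
}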
1933                                                  
1934 /*                                               
1935  * dax_insert_pfn_mkwrite - insert PTE or PMD    
1936  * @vmf: The description of the fault            
1937  * @pfn: PFN to insert                           
1938  * @order: Order of entry to insert.             
1939  *                                               
1940  * This function inserts a writeable PTE or P    
1941  * for an mmaped DAX file.  It also marks the    
1942  */                                              
1943 static vm_fault_t                                
1944 dax_insert_pfn_mkwrite(struct vm_fault *vmf,     
1945 {                                                
1946         struct address_space *mapping = vmf->    
1947         XA_STATE_ORDER(xas, &mapping->i_pages    
1948         void *entry;                             
1949         vm_fault_t ret;                          
1950                                                  
1951         xas_lock_irq(&xas);                      
1952         entry = get_unlocked_entry(&xas, orde    
1953         /* Did we race with someone splitting    
1954         if (!entry || dax_is_conflict(entry)     
1955             (order == 0 && !dax_is_pte_entry(    
1956                 put_unlocked_entry(&xas, entr    
1957                 xas_unlock_irq(&xas);            
1958                 trace_dax_insert_pfn_mkwrite_    
1959                                                  
1960                 return VM_FAULT_NOPAGE;          
1961         }                                        
1962         xas_set_mark(&xas, PAGECACHE_TAG_DIRT    
1963         dax_lock_entry(&xas, entry);             
1964         xas_unlock_irq(&xas);                    
1965         if (order == 0)                          
1966                 ret = vmf_insert_mixed_mkwrit    
1967 #ifdef CONFIG_FS_DAX_PMD                         
1968         else if (order == PMD_ORDER)             
1969                 ret = vmf_insert_pfn_pmd(vmf,    
1970 #endif                                           
1971         else                                     
1972                 ret = VM_FAULT_FALLBACK;         
1973         dax_unlock_entry(&xas, entry);           
1974         trace_dax_insert_pfn_mkwrite(mapping-    
1975         return ret;                              
1976 }                                                
1977                                                  
1978 /**
1979  * dax_finish_sync_fault - finish synchronous page fault
1980  * @vmf: The description of the fault
1981  * @order: Order of entry to be inserted
1982  * @pfn: PFN to insert
1983  *
1984  * This function ensures that the file range touched by the page fault is
1985  * stored persistently on the media and handles inserting of appropriate page
1986  * table entry.
1987  */
1988 vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order,
1989                 pfn_t pfn)
1990 {
1991         int err;
1992         loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
1993         size_t len = PAGE_SIZE << order;
1994 
1995         err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
1996         if (err)
1997                 return VM_FAULT_SIGBUS;
1998         return dax_insert_pfn_mkwrite(vmf, pfn, order);
1999 }
2000 EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
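/*
 * Example (not part of fs/dax.c): handling a synchronous fault on a MAP_SYNC
 * mapping. When dax_iomap_fault() returns VM_FAULT_NEEDDSYNC it has stored the
 * pfn but not installed it; the caller must make the allocation metadata
 * persistent before the page table entry becomes writeable, which is what
 * dax_finish_sync_fault() does above via vfs_fsync_range(). A sketch; the
 * myfs_iomap_ops instance is an assumption.
 */
extern const struct iomap_ops myfs_iomap_ops;           /* assumed */

static vm_fault_t myfs_dax_sync_fault(struct vm_fault *vmf, unsigned int order)
{
        vm_fault_t ret;
        pfn_t pfn;

        ret = dax_iomap_fault(vmf, order, &pfn, NULL, &myfs_iomap_ops);
        if (ret & VM_FAULT_NEEDDSYNC)
                ret = dax_finish_sync_fault(vmf, order, pfn);
        return ret;
}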
2001                                                  
2002 static loff_t dax_range_compare_iter(struct i    
2003                 struct iomap_iter *it_dest, u    
2004 {                                                
2005         const struct iomap *smap = &it_src->i    
2006         const struct iomap *dmap = &it_dest->    
2007         loff_t pos1 = it_src->pos, pos2 = it_    
2008         void *saddr, *daddr;                     
2009         int id, ret;                             
2010                                                  
2011         len = min(len, min(smap->length, dmap    
2012                                                  
2013         if (smap->type == IOMAP_HOLE && dmap-    
2014                 *same = true;                    
2015                 return len;                      
2016         }                                        
2017                                                  
2018         if (smap->type == IOMAP_HOLE || dmap-    
2019                 *same = false;                   
2020                 return 0;                        
2021         }                                        
2022                                                  
2023         id = dax_read_lock();                    
2024         ret = dax_iomap_direct_access(smap, p    
2025                                       &saddr,    
2026         if (ret < 0)                             
2027                 goto out_unlock;                 
2028                                                  
2029         ret = dax_iomap_direct_access(dmap, p    
2030                                       &daddr,    
2031         if (ret < 0)                             
2032                 goto out_unlock;                 
2033                                                  
2034         *same = !memcmp(saddr, daddr, len);      
2035         if (!*same)                              
2036                 len = 0;                         
2037         dax_read_unlock(id);                     
2038         return len;                              
2039                                                  
2040 out_unlock:                                      
2041         dax_read_unlock(id);                     
2042         return -EIO;                             
2043 }                                                
2044                                                  
2045 int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
2046                 struct inode *dst, loff_t dstoff, loff_t len, bool *same,
2047                 const struct iomap_ops *ops)
2048 {
2049         struct iomap_iter src_iter = {
2050                 .inode          = src,
2051                 .pos            = srcoff,
2052                 .len            = len,
2053                 .flags          = IOMAP_DAX,
2054         };
2055         struct iomap_iter dst_iter = {
2056                 .inode          = dst,
2057                 .pos            = dstoff,
2058                 .len            = len,
2059                 .flags          = IOMAP_DAX,
2060         };
2061         int ret, compared = 0;
2062 
2063         while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
2064                (ret = iomap_iter(&dst_iter, ops)) > 0) {
2065                 compared = dax_range_compare_iter(&src_iter, &dst_iter,
2066                                 min(src_iter.len, dst_iter.len), same);
2067                 if (compared < 0)
2068                         return ret;
2069                 src_iter.processed = dst_iter.processed = compared;
2070         }
2071         return ret;
2072 }
2073                                                  
2074 int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
2075                               struct file *file_out, loff_t pos_out,
2076                               loff_t *len, unsigned int remap_flags,
2077                               const struct iomap_ops *ops)
2078 {
2079         return __generic_remap_file_range_prep(file_in, pos_in, file_out,
2080                                                pos_out, len, remap_flags, ops);
2081 }
2082 EXPORT_SYMBOL_GPL(dax_remap_file_range_prep);
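/*
 * Example (not part of fs/dax.c): a DAX-aware ->remap_file_range() using the
 * prep helper above. dax_remap_file_range_prep() stands in for
 * generic_remap_file_range_prep() so that dedupe comparisons on DAX inodes go
 * through dax_dedupe_file_range_compare() instead of the page cache. A sketch
 * only: inode locking is omitted, the myfs_read_iomap_ops instance is an
 * assumption, and the final extent-sharing step is filesystem specific.
 */
extern const struct iomap_ops myfs_read_iomap_ops;      /* assumed */

static loff_t myfs_remap_file_range(struct file *file_in, loff_t pos_in,
                                    struct file *file_out, loff_t pos_out,
                                    loff_t len, unsigned int remap_flags)
{
        int ret;

        if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
                return -EINVAL;

        ret = dax_remap_file_range_prep(file_in, pos_in, file_out, pos_out,
                                        &len, remap_flags,
                                        &myfs_read_iomap_ops);
        if (ret < 0 || len == 0)
                return ret;

        /* A real filesystem would now share/copy the underlying extents. */
        return len;
}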
2083                                                  
