1 // SPDX-License-Identifier: GPL-2.0-or-later << 2 /* 1 /* 3 * Copyright 2013 Red Hat Inc. 2 * Copyright 2013 Red Hat Inc. 4 * 3 * 5 * Authors: Jérôme Glisse <jglisse@redhat.co !! 4 * This program is free software; you can redistribute it and/or modify >> 5 * it under the terms of the GNU General Public License as published by >> 6 * the Free Software Foundation; either version 2 of the License, or >> 7 * (at your option) any later version. >> 8 * >> 9 * This program is distributed in the hope that it will be useful, >> 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of >> 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> 12 * GNU General Public License for more details. >> 13 * >> 14 * Authors: Jérôme Glisse <jglisse@redhat.com> 6 */ 15 */ 7 /* 16 /* 8 * Refer to include/linux/hmm.h for informatio 17 * Refer to include/linux/hmm.h for information about heterogeneous memory 9 * management or HMM for short. 18 * management or HMM for short. 10 */ 19 */ 11 #include <linux/pagewalk.h> !! 20 #include <linux/mm.h> 12 #include <linux/hmm.h> 21 #include <linux/hmm.h> 13 #include <linux/init.h> 22 #include <linux/init.h> 14 #include <linux/rmap.h> 23 #include <linux/rmap.h> 15 #include <linux/swap.h> 24 #include <linux/swap.h> 16 #include <linux/slab.h> 25 #include <linux/slab.h> 17 #include <linux/sched.h> 26 #include <linux/sched.h> 18 #include <linux/mmzone.h> 27 #include <linux/mmzone.h> 19 #include <linux/pagemap.h> 28 #include <linux/pagemap.h> 20 #include <linux/swapops.h> 29 #include <linux/swapops.h> 21 #include <linux/hugetlb.h> 30 #include <linux/hugetlb.h> 22 #include <linux/memremap.h> 31 #include <linux/memremap.h> 23 #include <linux/sched/mm.h> << 24 #include <linux/jump_label.h> 32 #include <linux/jump_label.h> 25 #include <linux/dma-mapping.h> << 26 #include <linux/mmu_notifier.h> 33 #include <linux/mmu_notifier.h> 27 #include <linux/memory_hotplug.h> 34 #include <linux/memory_hotplug.h> 28 35 29 #include "internal.h" !! 36 #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) >> 37 >> 38 #if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) >> 39 /* >> 40 * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h >> 41 */ >> 42 DEFINE_STATIC_KEY_FALSE(device_private_key); >> 43 EXPORT_SYMBOL(device_private_key); >> 44 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ >> 45 >> 46 >> 47 #if IS_ENABLED(CONFIG_HMM_MIRROR) >> 48 static const struct mmu_notifier_ops hmm_mmu_notifier_ops; >> 49 >> 50 /* >> 51 * struct hmm - HMM per mm struct >> 52 * >> 53 * @mm: mm struct this HMM struct is bound to >> 54 * @lock: lock protecting ranges list >> 55 * @sequence: we track updates to the CPU page table with a sequence number >> 56 * @ranges: list of range being snapshotted >> 57 * @mirrors: list of mirrors for this mm >> 58 * @mmu_notifier: mmu notifier to track updates to CPU page table >> 59 * @mirrors_sem: read/write semaphore protecting the mirrors list >> 60 */ >> 61 struct hmm { >> 62 struct mm_struct *mm; >> 63 spinlock_t lock; >> 64 atomic_t sequence; >> 65 struct list_head ranges; >> 66 struct list_head mirrors; >> 67 struct mmu_notifier mmu_notifier; >> 68 struct rw_semaphore mirrors_sem; >> 69 }; >> 70 >> 71 /* >> 72 * hmm_register - register HMM against an mm (HMM internal) >> 73 * >> 74 * @mm: mm struct to attach to >> 75 * >> 76 * This is not intended to be used directly by device drivers. It allocates an >> 77 * HMM struct if mm does not have one, and initializes it. 
>> 78 */ >> 79 static struct hmm *hmm_register(struct mm_struct *mm) >> 80 { >> 81 struct hmm *hmm = READ_ONCE(mm->hmm); >> 82 bool cleanup = false; >> 83 >> 84 /* >> 85 * The hmm struct can only be freed once the mm_struct goes away, >> 86 * hence we should always have pre-allocated an new hmm struct >> 87 * above. >> 88 */ >> 89 if (hmm) >> 90 return hmm; >> 91 >> 92 hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); >> 93 if (!hmm) >> 94 return NULL; >> 95 INIT_LIST_HEAD(&hmm->mirrors); >> 96 init_rwsem(&hmm->mirrors_sem); >> 97 atomic_set(&hmm->sequence, 0); >> 98 hmm->mmu_notifier.ops = NULL; >> 99 INIT_LIST_HEAD(&hmm->ranges); >> 100 spin_lock_init(&hmm->lock); >> 101 hmm->mm = mm; >> 102 >> 103 /* >> 104 * We should only get here if hold the mmap_sem in write mode ie on >> 105 * registration of first mirror through hmm_mirror_register() >> 106 */ >> 107 hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; >> 108 if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { >> 109 kfree(hmm); >> 110 return NULL; >> 111 } >> 112 >> 113 spin_lock(&mm->page_table_lock); >> 114 if (!mm->hmm) >> 115 mm->hmm = hmm; >> 116 else >> 117 cleanup = true; >> 118 spin_unlock(&mm->page_table_lock); >> 119 >> 120 if (cleanup) { >> 121 mmu_notifier_unregister(&hmm->mmu_notifier, mm); >> 122 kfree(hmm); >> 123 } >> 124 >> 125 return mm->hmm; >> 126 } >> 127 >> 128 void hmm_mm_destroy(struct mm_struct *mm) >> 129 { >> 130 kfree(mm->hmm); >> 131 } >> 132 >> 133 static void hmm_invalidate_range(struct hmm *hmm, >> 134 enum hmm_update_type action, >> 135 unsigned long start, >> 136 unsigned long end) >> 137 { >> 138 struct hmm_mirror *mirror; >> 139 struct hmm_range *range; >> 140 >> 141 spin_lock(&hmm->lock); >> 142 list_for_each_entry(range, &hmm->ranges, list) { >> 143 unsigned long addr, idx, npages; >> 144 >> 145 if (end < range->start || start >= range->end) >> 146 continue; >> 147 >> 148 range->valid = false; >> 149 addr = max(start, range->start); >> 150 idx = (addr - range->start) >> PAGE_SHIFT; >> 151 npages = (min(range->end, end) - addr) >> PAGE_SHIFT; >> 152 memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); >> 153 } >> 154 spin_unlock(&hmm->lock); >> 155 >> 156 down_read(&hmm->mirrors_sem); >> 157 list_for_each_entry(mirror, &hmm->mirrors, list) >> 158 mirror->ops->sync_cpu_device_pagetables(mirror, action, >> 159 start, end); >> 160 up_read(&hmm->mirrors_sem); >> 161 } >> 162 >> 163 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) >> 164 { >> 165 struct hmm_mirror *mirror; >> 166 struct hmm *hmm = mm->hmm; >> 167 >> 168 down_write(&hmm->mirrors_sem); >> 169 mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, >> 170 list); >> 171 while (mirror) { >> 172 list_del_init(&mirror->list); >> 173 if (mirror->ops->release) { >> 174 /* >> 175 * Drop mirrors_sem so callback can wait on any pending >> 176 * work that might itself trigger mmu_notifier callback >> 177 * and thus would deadlock with us. 
>> 178 */ >> 179 up_write(&hmm->mirrors_sem); >> 180 mirror->ops->release(mirror); >> 181 down_write(&hmm->mirrors_sem); >> 182 } >> 183 mirror = list_first_entry_or_null(&hmm->mirrors, >> 184 struct hmm_mirror, list); >> 185 } >> 186 up_write(&hmm->mirrors_sem); >> 187 } >> 188 >> 189 static void hmm_invalidate_range_start(struct mmu_notifier *mn, >> 190 struct mm_struct *mm, >> 191 unsigned long start, >> 192 unsigned long end) >> 193 { >> 194 struct hmm *hmm = mm->hmm; >> 195 >> 196 VM_BUG_ON(!hmm); >> 197 >> 198 atomic_inc(&hmm->sequence); >> 199 } >> 200 >> 201 static void hmm_invalidate_range_end(struct mmu_notifier *mn, >> 202 struct mm_struct *mm, >> 203 unsigned long start, >> 204 unsigned long end) >> 205 { >> 206 struct hmm *hmm = mm->hmm; >> 207 >> 208 VM_BUG_ON(!hmm); >> 209 >> 210 hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); >> 211 } >> 212 >> 213 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { >> 214 .release = hmm_release, >> 215 .invalidate_range_start = hmm_invalidate_range_start, >> 216 .invalidate_range_end = hmm_invalidate_range_end, >> 217 }; >> 218 >> 219 /* >> 220 * hmm_mirror_register() - register a mirror against an mm >> 221 * >> 222 * @mirror: new mirror struct to register >> 223 * @mm: mm to register against >> 224 * >> 225 * To start mirroring a process address space, the device driver must register >> 226 * an HMM mirror struct. >> 227 * >> 228 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! >> 229 */ >> 230 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) >> 231 { >> 232 /* Sanity check */ >> 233 if (!mm || !mirror || !mirror->ops) >> 234 return -EINVAL; >> 235 >> 236 again: >> 237 mirror->hmm = hmm_register(mm); >> 238 if (!mirror->hmm) >> 239 return -ENOMEM; >> 240 >> 241 down_write(&mirror->hmm->mirrors_sem); >> 242 if (mirror->hmm->mm == NULL) { >> 243 /* >> 244 * A racing hmm_mirror_unregister() is about to destroy the hmm >> 245 * struct. Try again to allocate a new one. >> 246 */ >> 247 up_write(&mirror->hmm->mirrors_sem); >> 248 mirror->hmm = NULL; >> 249 goto again; >> 250 } else { >> 251 list_add(&mirror->list, &mirror->hmm->mirrors); >> 252 up_write(&mirror->hmm->mirrors_sem); >> 253 } >> 254 >> 255 return 0; >> 256 } >> 257 EXPORT_SYMBOL(hmm_mirror_register); >> 258 >> 259 /* >> 260 * hmm_mirror_unregister() - unregister a mirror >> 261 * >> 262 * @mirror: new mirror struct to register >> 263 * >> 264 * Stop mirroring a process address space, and cleanup. 
>> 265 */ >> 266 void hmm_mirror_unregister(struct hmm_mirror *mirror) >> 267 { >> 268 bool should_unregister = false; >> 269 struct mm_struct *mm; >> 270 struct hmm *hmm; >> 271 >> 272 if (mirror->hmm == NULL) >> 273 return; >> 274 >> 275 hmm = mirror->hmm; >> 276 down_write(&hmm->mirrors_sem); >> 277 list_del_init(&mirror->list); >> 278 should_unregister = list_empty(&hmm->mirrors); >> 279 mirror->hmm = NULL; >> 280 mm = hmm->mm; >> 281 hmm->mm = NULL; >> 282 up_write(&hmm->mirrors_sem); >> 283 >> 284 if (!should_unregister || mm == NULL) >> 285 return; >> 286 >> 287 spin_lock(&mm->page_table_lock); >> 288 if (mm->hmm == hmm) >> 289 mm->hmm = NULL; >> 290 spin_unlock(&mm->page_table_lock); >> 291 >> 292 mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); >> 293 kfree(hmm); >> 294 } >> 295 EXPORT_SYMBOL(hmm_mirror_unregister); 30 296 31 struct hmm_vma_walk { 297 struct hmm_vma_walk { 32 struct hmm_range *range; 298 struct hmm_range *range; 33 unsigned long last; 299 unsigned long last; >> 300 bool fault; >> 301 bool block; 34 }; 302 }; 35 303 36 enum { !! 304 static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, 37 HMM_NEED_FAULT = 1 << 0, !! 305 bool write_fault, uint64_t *pfn) 38 HMM_NEED_WRITE_FAULT = 1 << 1, !! 306 { 39 HMM_NEED_ALL_BITS = HMM_NEED_FAULT | H !! 307 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; 40 }; !! 308 struct hmm_vma_walk *hmm_vma_walk = walk->private; >> 309 struct hmm_range *range = hmm_vma_walk->range; >> 310 struct vm_area_struct *vma = walk->vma; >> 311 int r; >> 312 >> 313 flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; >> 314 flags |= write_fault ? FAULT_FLAG_WRITE : 0; >> 315 r = handle_mm_fault(vma, addr, flags); >> 316 if (r & VM_FAULT_RETRY) >> 317 return -EBUSY; >> 318 if (r & VM_FAULT_ERROR) { >> 319 *pfn = range->values[HMM_PFN_ERROR]; >> 320 return -EFAULT; >> 321 } >> 322 >> 323 return -EAGAIN; >> 324 } 41 325 42 static int hmm_pfns_fill(unsigned long addr, u !! 326 static int hmm_pfns_bad(unsigned long addr, 43 struct hmm_range *ran !! 327 unsigned long end, >> 328 struct mm_walk *walk) 44 { 329 { 45 unsigned long i = (addr - range->start !! 330 struct hmm_vma_walk *hmm_vma_walk = walk->private; >> 331 struct hmm_range *range = hmm_vma_walk->range; >> 332 uint64_t *pfns = range->pfns; >> 333 unsigned long i; 46 334 >> 335 i = (addr - range->start) >> PAGE_SHIFT; 47 for (; addr < end; addr += PAGE_SIZE, 336 for (; addr < end; addr += PAGE_SIZE, i++) 48 range->hmm_pfns[i] = cpu_flags !! 337 pfns[i] = range->values[HMM_PFN_ERROR]; >> 338 49 return 0; 339 return 0; 50 } 340 } 51 341 52 /* 342 /* 53 * hmm_vma_fault() - fault in a range lacking !! 343 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) 54 * @addr: range virtual start address (inclusi !! 344 * @start: range virtual start address (inclusive) 55 * @end: range virtual end address (exclusive) 345 * @end: range virtual end address (exclusive) 56 * @required_fault: HMM_NEED_* flags !! 346 * @fault: should we fault or not ? >> 347 * @write_fault: write fault ? 57 * @walk: mm_walk structure 348 * @walk: mm_walk structure 58 * Return: -EBUSY after page fault, or page fa !! 349 * Returns: 0 on success, -EAGAIN after page fault, or page fault error 59 * 350 * 60 * This function will be called whenever pmd_n 351 * This function will be called whenever pmd_none() or pte_none() returns true, 61 * or whenever there is no page directory cove 352 * or whenever there is no page directory covering the virtual address range. 
62 */ 353 */ 63 static int hmm_vma_fault(unsigned long addr, u !! 354 static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, 64 unsigned int required !! 355 bool fault, bool write_fault, >> 356 struct mm_walk *walk) 65 { 357 { 66 struct hmm_vma_walk *hmm_vma_walk = wa 358 struct hmm_vma_walk *hmm_vma_walk = walk->private; 67 struct vm_area_struct *vma = walk->vma !! 359 struct hmm_range *range = hmm_vma_walk->range; 68 unsigned int fault_flags = FAULT_FLAG_ !! 360 uint64_t *pfns = range->pfns; >> 361 unsigned long i; 69 362 70 WARN_ON_ONCE(!required_fault); << 71 hmm_vma_walk->last = addr; 363 hmm_vma_walk->last = addr; >> 364 i = (addr - range->start) >> PAGE_SHIFT; >> 365 for (; addr < end; addr += PAGE_SIZE, i++) { >> 366 pfns[i] = range->values[HMM_PFN_NONE]; >> 367 if (fault || write_fault) { >> 368 int ret; >> 369 >> 370 ret = hmm_vma_do_fault(walk, addr, write_fault, >> 371 &pfns[i]); >> 372 if (ret != -EAGAIN) >> 373 return ret; >> 374 } >> 375 } 72 376 73 if (required_fault & HMM_NEED_WRITE_FA !! 377 return (fault || write_fault) ? -EAGAIN : 0; 74 if (!(vma->vm_flags & VM_WRITE !! 378 } 75 return -EPERM; !! 379 76 fault_flags |= FAULT_FLAG_WRIT !! 380 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 77 } !! 381 uint64_t pfns, uint64_t cpu_flags, 78 !! 382 bool *fault, bool *write_fault) 79 for (; addr < end; addr += PAGE_SIZE) << 80 if (handle_mm_fault(vma, addr, << 81 VM_FAULT_ERROR) << 82 return -EFAULT; << 83 return -EBUSY; << 84 } << 85 << 86 static unsigned int hmm_pte_need_fault(const s << 87 unsigne << 88 unsigne << 89 { 383 { 90 struct hmm_range *range = hmm_vma_walk 384 struct hmm_range *range = hmm_vma_walk->range; 91 385 92 /* !! 386 *fault = *write_fault = false; 93 * So we not only consider the individ !! 387 if (!hmm_vma_walk->fault) 94 * consider the default flags requeste !! 388 return; 95 * be used 2 ways. The first one where << 96 * multiple page faults into one reque << 97 * those faults. The second one where << 98 * fault a range with specific flags. << 99 * waste to have the user pre-fill the << 100 * flags value. << 101 */ << 102 pfn_req_flags &= range->pfn_flags_mask << 103 pfn_req_flags |= range->default_flags; << 104 389 105 /* We aren't ask to do anything ... */ 390 /* We aren't ask to do anything ... */ 106 if (!(pfn_req_flags & HMM_PFN_REQ_FAUL !! 391 if (!(pfns & range->flags[HMM_PFN_VALID])) 107 return 0; !! 392 return; 108 !! 393 /* If this is device memory than only fault if explicitly requested */ 109 /* Need to write fault ? */ !! 394 if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 110 if ((pfn_req_flags & HMM_PFN_REQ_WRITE !! 395 /* Do we fault on device memory ? */ 111 !(cpu_flags & HMM_PFN_WRITE)) !! 396 if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { 112 return HMM_NEED_FAULT | HMM_NE !! 397 *write_fault = pfns & range->flags[HMM_PFN_WRITE]; >> 398 *fault = true; >> 399 } >> 400 return; >> 401 } 113 402 114 /* If CPU page table is not valid then 403 /* If CPU page table is not valid then we need to fault */ 115 if (!(cpu_flags & HMM_PFN_VALID)) !! 404 *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); 116 return HMM_NEED_FAULT; !! 405 /* Need to write fault ? */ 117 return 0; !! 406 if ((pfns & range->flags[HMM_PFN_WRITE]) && >> 407 !(cpu_flags & range->flags[HMM_PFN_WRITE])) { >> 408 *write_fault = true; >> 409 *fault = true; >> 410 } 118 } 411 } 119 412 120 static unsigned int !! 
413 static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 121 hmm_range_need_fault(const struct hmm_vma_walk !! 414 const uint64_t *pfns, unsigned long npages, 122 const unsigned long hmm_p !! 415 uint64_t cpu_flags, bool *fault, 123 unsigned long cpu_flags) !! 416 bool *write_fault) 124 { 417 { 125 struct hmm_range *range = hmm_vma_walk << 126 unsigned int required_fault = 0; << 127 unsigned long i; 418 unsigned long i; 128 419 129 /* !! 420 if (!hmm_vma_walk->fault) { 130 * If the default flags do not request !! 421 *fault = *write_fault = false; 131 * not allow for individual pages to b !! 422 return; 132 * hmm_pte_need_fault() will always re !! 423 } 133 */ << 134 if (!((range->default_flags | range->p << 135 HMM_PFN_REQ_FAULT)) << 136 return 0; << 137 424 138 for (i = 0; i < npages; ++i) { 425 for (i = 0; i < npages; ++i) { 139 required_fault |= hmm_pte_need !! 426 hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, 140 !! 427 fault, write_fault); 141 if (required_fault == HMM_NEED !! 428 if ((*fault) || (*write_fault)) 142 return required_fault; !! 429 return; 143 } 430 } 144 return required_fault; << 145 } 431 } 146 432 147 static int hmm_vma_walk_hole(unsigned long add 433 static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, 148 __always_unused i !! 434 struct mm_walk *walk) 149 { 435 { 150 struct hmm_vma_walk *hmm_vma_walk = wa 436 struct hmm_vma_walk *hmm_vma_walk = walk->private; 151 struct hmm_range *range = hmm_vma_walk 437 struct hmm_range *range = hmm_vma_walk->range; 152 unsigned int required_fault; !! 438 bool fault, write_fault; 153 unsigned long i, npages; 439 unsigned long i, npages; 154 unsigned long *hmm_pfns; !! 440 uint64_t *pfns; 155 441 156 i = (addr - range->start) >> PAGE_SHIF 442 i = (addr - range->start) >> PAGE_SHIFT; 157 npages = (end - addr) >> PAGE_SHIFT; 443 npages = (end - addr) >> PAGE_SHIFT; 158 hmm_pfns = &range->hmm_pfns[i]; !! 444 pfns = &range->pfns[i]; 159 required_fault = !! 445 hmm_range_need_fault(hmm_vma_walk, pfns, npages, 160 hmm_range_need_fault(hmm_vma_w !! 446 0, &fault, &write_fault); 161 if (!walk->vma) { !! 447 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 162 if (required_fault) << 163 return -EFAULT; << 164 return hmm_pfns_fill(addr, end << 165 } << 166 if (required_fault) << 167 return hmm_vma_fault(addr, end << 168 return hmm_pfns_fill(addr, end, range, << 169 } << 170 << 171 static inline unsigned long hmm_pfn_flags_orde << 172 { << 173 return order << HMM_PFN_ORDER_SHIFT; << 174 } 448 } 175 449 176 static inline unsigned long pmd_to_hmm_pfn_fla !! 450 static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) 177 << 178 { 451 { 179 if (pmd_protnone(pmd)) 452 if (pmd_protnone(pmd)) 180 return 0; 453 return 0; 181 return (pmd_write(pmd) ? (HMM_PFN_VALI !! 454 return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | 182 HMM_PFN_VALID !! 455 range->flags[HMM_PFN_WRITE] : 183 hmm_pfn_flags_order(PMD_SHIFT - !! 456 range->flags[HMM_PFN_VALID]; 184 } 457 } 185 458 186 #ifdef CONFIG_TRANSPARENT_HUGEPAGE !! 459 static int hmm_vma_handle_pmd(struct mm_walk *walk, 187 static int hmm_vma_handle_pmd(struct mm_walk * !! 460 unsigned long addr, 188 unsigned long en !! 
461 unsigned long end, >> 462 uint64_t *pfns, 189 pmd_t pmd) 463 pmd_t pmd) 190 { 464 { 191 struct hmm_vma_walk *hmm_vma_walk = wa 465 struct hmm_vma_walk *hmm_vma_walk = walk->private; 192 struct hmm_range *range = hmm_vma_walk 466 struct hmm_range *range = hmm_vma_walk->range; 193 unsigned long pfn, npages, i; 467 unsigned long pfn, npages, i; 194 unsigned int required_fault; !! 468 bool fault, write_fault; 195 unsigned long cpu_flags; !! 469 uint64_t cpu_flags; 196 470 197 npages = (end - addr) >> PAGE_SHIFT; 471 npages = (end - addr) >> PAGE_SHIFT; 198 cpu_flags = pmd_to_hmm_pfn_flags(range 472 cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); 199 required_fault = !! 473 hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, 200 hmm_range_need_fault(hmm_vma_w !! 474 &fault, &write_fault); 201 if (required_fault) !! 475 202 return hmm_vma_fault(addr, end !! 476 if (pmd_protnone(pmd) || fault || write_fault) >> 477 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 203 478 204 pfn = pmd_pfn(pmd) + ((addr & ~PMD_MAS !! 479 pfn = pmd_pfn(pmd) + pte_index(addr); 205 for (i = 0; addr < end; addr += PAGE_S 480 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) 206 hmm_pfns[i] = pfn | cpu_flags; !! 481 pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; >> 482 hmm_vma_walk->last = end; 207 return 0; 483 return 0; 208 } 484 } 209 #else /* CONFIG_TRANSPARENT_HUGEPAGE */ << 210 /* stub to allow the code below to compile */ << 211 int hmm_vma_handle_pmd(struct mm_walk *walk, u << 212 unsigned long end, unsigned lo << 213 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ << 214 485 215 static inline unsigned long pte_to_hmm_pfn_fla !! 486 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) 216 << 217 { 487 { 218 if (pte_none(pte) || !pte_present(pte) !! 488 if (pte_none(pte) || !pte_present(pte)) 219 return 0; 489 return 0; 220 return pte_write(pte) ? (HMM_PFN_VALID !! 490 return pte_write(pte) ? range->flags[HMM_PFN_VALID] | >> 491 range->flags[HMM_PFN_WRITE] : >> 492 range->flags[HMM_PFN_VALID]; 221 } 493 } 222 494 223 static int hmm_vma_handle_pte(struct mm_walk * 495 static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, 224 unsigned long en 496 unsigned long end, pmd_t *pmdp, pte_t *ptep, 225 unsigned long *h !! 497 uint64_t *pfn) 226 { 498 { 227 struct hmm_vma_walk *hmm_vma_walk = wa 499 struct hmm_vma_walk *hmm_vma_walk = walk->private; 228 struct hmm_range *range = hmm_vma_walk 500 struct hmm_range *range = hmm_vma_walk->range; 229 unsigned int required_fault; !! 501 struct vm_area_struct *vma = walk->vma; 230 unsigned long cpu_flags; !! 502 bool fault, write_fault; 231 pte_t pte = ptep_get(ptep); !! 503 uint64_t cpu_flags; 232 uint64_t pfn_req_flags = *hmm_pfn; !! 504 pte_t pte = *ptep; 233 !! 505 uint64_t orig_pfn = *pfn; 234 if (pte_none_mostly(pte)) { !! 506 235 required_fault = !! 507 *pfn = range->values[HMM_PFN_NONE]; 236 hmm_pte_need_fault(hmm !! 508 cpu_flags = pte_to_hmm_pfn_flags(range, pte); 237 if (required_fault) !! 509 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, >> 510 &fault, &write_fault); >> 511 >> 512 if (pte_none(pte)) { >> 513 if (fault || write_fault) 238 goto fault; 514 goto fault; 239 *hmm_pfn = 0; << 240 return 0; 515 return 0; 241 } 516 } 242 517 243 if (!pte_present(pte)) { 518 if (!pte_present(pte)) { 244 swp_entry_t entry = pte_to_swp 519 swp_entry_t entry = pte_to_swp_entry(pte); 245 520 246 /* !! 521 if (!non_swap_entry(entry)) { 247 * Don't fault in device priva !! 
522 if (fault || write_fault) 248 * just report the PFN. !! 523 goto fault; 249 */ << 250 if (is_device_private_entry(en << 251 pfn_swap_entry_to_page(ent << 252 range->dev_private_owner) << 253 cpu_flags = HMM_PFN_VA << 254 if (is_writable_device << 255 cpu_flags |= H << 256 *hmm_pfn = swp_offset_ << 257 return 0; 524 return 0; 258 } 525 } 259 526 260 required_fault = !! 527 /* 261 hmm_pte_need_fault(hmm !! 528 * This is a special swap entry, ignore migration, use 262 if (!required_fault) { !! 529 * device and report anything else as error. 263 *hmm_pfn = 0; !! 530 */ >> 531 if (is_device_private_entry(entry)) { >> 532 cpu_flags = range->flags[HMM_PFN_VALID] | >> 533 range->flags[HMM_PFN_DEVICE_PRIVATE]; >> 534 cpu_flags |= is_write_device_private_entry(entry) ? >> 535 range->flags[HMM_PFN_WRITE] : 0; >> 536 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, >> 537 &fault, &write_fault); >> 538 if (fault || write_fault) >> 539 goto fault; >> 540 *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); >> 541 *pfn |= cpu_flags; 264 return 0; 542 return 0; 265 } 543 } 266 544 267 if (!non_swap_entry(entry)) << 268 goto fault; << 269 << 270 if (is_device_private_entry(en << 271 goto fault; << 272 << 273 if (is_device_exclusive_entry( << 274 goto fault; << 275 << 276 if (is_migration_entry(entry)) 545 if (is_migration_entry(entry)) { 277 pte_unmap(ptep); !! 546 if (fault || write_fault) { 278 hmm_vma_walk->last = a !! 547 pte_unmap(ptep); 279 migration_entry_wait(w !! 548 hmm_vma_walk->last = addr; 280 return -EBUSY; !! 549 migration_entry_wait(vma->vm_mm, >> 550 pmdp, addr); >> 551 return -EAGAIN; >> 552 } >> 553 return 0; 281 } 554 } 282 555 283 /* Report error for everything 556 /* Report error for everything else */ 284 pte_unmap(ptep); !! 557 *pfn = range->values[HMM_PFN_ERROR]; 285 return -EFAULT; 558 return -EFAULT; 286 } 559 } 287 560 288 cpu_flags = pte_to_hmm_pfn_flags(range !! 561 if (fault || write_fault) 289 required_fault = << 290 hmm_pte_need_fault(hmm_vma_wal << 291 if (required_fault) << 292 goto fault; 562 goto fault; 293 563 294 /* !! 564 *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; 295 * Bypass devmap pte such as DAX page << 296 * flags(pfn_req_flags) are fulfilled. << 297 * Since each architecture defines a s << 298 * fall through and treat it like a no << 299 */ << 300 if (!vm_normal_page(walk->vma, addr, p << 301 !pte_devmap(pte) && << 302 !is_zero_pfn(pte_pfn(pte))) { << 303 if (hmm_pte_need_fault(hmm_vma << 304 pte_unmap(ptep); << 305 return -EFAULT; << 306 } << 307 *hmm_pfn = HMM_PFN_ERROR; << 308 return 0; << 309 } << 310 << 311 *hmm_pfn = pte_pfn(pte) | cpu_flags; << 312 return 0; 565 return 0; 313 566 314 fault: 567 fault: 315 pte_unmap(ptep); 568 pte_unmap(ptep); 316 /* Fault any virtual address we were a 569 /* Fault any virtual address we were asked to fault */ 317 return hmm_vma_fault(addr, end, requir !! 570 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 318 } 571 } 319 572 320 static int hmm_vma_walk_pmd(pmd_t *pmdp, 573 static int hmm_vma_walk_pmd(pmd_t *pmdp, 321 unsigned long star 574 unsigned long start, 322 unsigned long end, 575 unsigned long end, 323 struct mm_walk *wa 576 struct mm_walk *walk) 324 { 577 { 325 struct hmm_vma_walk *hmm_vma_walk = wa 578 struct hmm_vma_walk *hmm_vma_walk = walk->private; 326 struct hmm_range *range = hmm_vma_walk 579 struct hmm_range *range = hmm_vma_walk->range; 327 unsigned long *hmm_pfns = !! 580 uint64_t *pfns = range->pfns; 328 &range->hmm_pfns[(start - rang !! 
581 unsigned long addr = start, i; 329 unsigned long npages = (end - start) > << 330 unsigned long addr = start; << 331 pte_t *ptep; 582 pte_t *ptep; 332 pmd_t pmd; !! 583 >> 584 i = (addr - range->start) >> PAGE_SHIFT; 333 585 334 again: 586 again: 335 pmd = pmdp_get_lockless(pmdp); !! 587 if (pmd_none(*pmdp)) 336 if (pmd_none(pmd)) !! 588 return hmm_vma_walk_hole(start, end, walk); 337 return hmm_vma_walk_hole(start << 338 589 339 if (thp_migration_supported() && is_pm !! 590 if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) 340 if (hmm_range_need_fault(hmm_v !! 591 return hmm_pfns_bad(start, end, walk); 341 hmm_vma_walk->last = a << 342 pmd_migration_entry_wa << 343 return -EBUSY; << 344 } << 345 return hmm_pfns_fill(start, en << 346 } << 347 592 348 if (!pmd_present(pmd)) { !! 593 if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { 349 if (hmm_range_need_fault(hmm_v !! 594 pmd_t pmd; 350 return -EFAULT; << 351 return hmm_pfns_fill(start, en << 352 } << 353 595 354 if (pmd_devmap(pmd) || pmd_trans_huge( << 355 /* 596 /* 356 * No need to take pmd_lock he !! 597 * No need to take pmd_lock here, even if some other threads 357 * is splitting the huge pmd w 598 * is splitting the huge pmd we will get that event through 358 * mmu_notifier callback. 599 * mmu_notifier callback. 359 * 600 * 360 * So just read pmd value and !! 601 * So just read pmd value and check again its a transparent 361 * huge or device mapping one 602 * huge or device mapping one and compute corresponding pfn 362 * values. 603 * values. 363 */ 604 */ 364 pmd = pmdp_get_lockless(pmdp); !! 605 pmd = pmd_read_atomic(pmdp); >> 606 barrier(); 365 if (!pmd_devmap(pmd) && !pmd_t 607 if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) 366 goto again; 608 goto again; 367 609 368 return hmm_vma_handle_pmd(walk !! 610 return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); 369 } 611 } 370 612 371 /* !! 613 if (pmd_bad(*pmdp)) 372 * We have handled all the valid cases !! 614 return hmm_pfns_bad(start, end, walk); 373 * huge or transparent huge. At this p << 374 * entry pointing to pte directory or << 375 * recover. << 376 */ << 377 if (pmd_bad(pmd)) { << 378 if (hmm_range_need_fault(hmm_v << 379 return -EFAULT; << 380 return hmm_pfns_fill(start, en << 381 } << 382 615 383 ptep = pte_offset_map(pmdp, addr); 616 ptep = pte_offset_map(pmdp, addr); 384 if (!ptep) !! 617 for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { 385 goto again; << 386 for (; addr < end; addr += PAGE_SIZE, << 387 int r; 618 int r; 388 619 389 r = hmm_vma_handle_pte(walk, a !! 620 r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); 390 if (r) { 621 if (r) { 391 /* hmm_vma_handle_pte( !! 622 /* hmm_vma_handle_pte() did unmap pte directory */ >> 623 hmm_vma_walk->last = addr; 392 return r; 624 return r; 393 } 625 } 394 } 626 } 395 pte_unmap(ptep - 1); 627 pte_unmap(ptep - 1); >> 628 >> 629 hmm_vma_walk->last = addr; 396 return 0; 630 return 0; 397 } 631 } 398 632 399 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ !! 633 static void hmm_pfns_clear(struct hmm_range *range, 400 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEP !! 634 uint64_t *pfns, 401 static inline unsigned long pud_to_hmm_pfn_fla !! 635 unsigned long addr, 402 !! 636 unsigned long end) 403 { 637 { 404 if (!pud_present(pud)) !! 638 for (; addr < end; addr += PAGE_SIZE, pfns++) 405 return 0; !! 639 *pfns = range->values[HMM_PFN_NONE]; 406 return (pud_write(pud) ? 
(HMM_PFN_VALI << 407 HMM_PFN_VALID << 408 hmm_pfn_flags_order(PUD_SHIFT - << 409 } 640 } 410 641 411 static int hmm_vma_walk_pud(pud_t *pudp, unsig !! 642 static void hmm_pfns_special(struct hmm_range *range) 412 struct mm_walk *walk) << 413 { 643 { 414 struct hmm_vma_walk *hmm_vma_walk = wa !! 644 unsigned long addr = range->start, i = 0; 415 struct hmm_range *range = hmm_vma_walk << 416 unsigned long addr = start; << 417 pud_t pud; << 418 spinlock_t *ptl = pud_trans_huge_lock( << 419 << 420 if (!ptl) << 421 return 0; << 422 645 423 /* Normally we don't want to split the !! 646 for (; addr < range->end; addr += PAGE_SIZE, i++) 424 walk->action = ACTION_CONTINUE; !! 647 range->pfns[i] = range->values[HMM_PFN_SPECIAL]; >> 648 } 425 649 426 pud = READ_ONCE(*pudp); !! 650 /* 427 if (!pud_present(pud)) { !! 651 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses 428 spin_unlock(ptl); !! 652 * @range: range being snapshotted 429 return hmm_vma_walk_hole(start !! 653 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid 430 } !! 654 * vma permission, 0 success 431 !! 655 * 432 if (pud_leaf(pud) && pud_devmap(pud)) !! 656 * This snapshots the CPU page table for a range of virtual addresses. Snapshot 433 unsigned long i, npages, pfn; !! 657 * validity is tracked by range struct. See hmm_vma_range_done() for further 434 unsigned int required_fault; !! 658 * information. 435 unsigned long *hmm_pfns; !! 659 * 436 unsigned long cpu_flags; !! 660 * The range struct is initialized here. It tracks the CPU page table, but only 437 !! 661 * if the function returns success (0), in which case the caller must then call 438 i = (addr - range->start) >> P !! 662 * hmm_vma_range_done() to stop CPU page table update tracking on this range. 439 npages = (end - addr) >> PAGE_ !! 663 * 440 hmm_pfns = &range->hmm_pfns[i] !! 664 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS 441 !! 665 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! 442 cpu_flags = pud_to_hmm_pfn_fla !! 666 */ 443 required_fault = hmm_range_nee !! 667 int hmm_vma_get_pfns(struct hmm_range *range) 444 !! 668 { 445 if (required_fault) { !! 669 struct vm_area_struct *vma = range->vma; 446 spin_unlock(ptl); !! 670 struct hmm_vma_walk hmm_vma_walk; 447 return hmm_vma_fault(a !! 671 struct mm_walk mm_walk; 448 } !! 672 struct hmm *hmm; >> 673 >> 674 /* Sanity check, this really should not happen ! */ >> 675 if (range->start < vma->vm_start || range->start >= vma->vm_end) >> 676 return -EINVAL; >> 677 if (range->end < vma->vm_start || range->end > vma->vm_end) >> 678 return -EINVAL; >> 679 >> 680 hmm = hmm_register(vma->vm_mm); >> 681 if (!hmm) >> 682 return -ENOMEM; >> 683 /* Caller must have registered a mirror, via hmm_mirror_register() ! */ >> 684 if (!hmm->mmu_notifier.ops) >> 685 return -EINVAL; >> 686 >> 687 /* FIXME support hugetlb fs */ >> 688 if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { >> 689 hmm_pfns_special(range); >> 690 return -EINVAL; >> 691 } 449 692 450 pfn = pud_pfn(pud) + ((addr & !! 693 if (!(vma->vm_flags & VM_READ)) { 451 for (i = 0; i < npages; ++i, + !! 694 /* 452 hmm_pfns[i] = pfn | cp !! 695 * If vma do not allow read access, then assume that it does 453 goto out_unlock; !! 696 * not allow write access, either. Architecture that allow >> 697 * write without read access are not supported by HMM, because >> 698 * operations such has atomic access would not work. 
>> 699 */ >> 700 hmm_pfns_clear(range, range->pfns, range->start, range->end); >> 701 return -EPERM; 454 } 702 } 455 703 456 /* Ask for the PUD to be split */ !! 704 /* Initialize range to track CPU page table update */ 457 walk->action = ACTION_SUBTREE; !! 705 spin_lock(&hmm->lock); >> 706 range->valid = true; >> 707 list_add_rcu(&range->list, &hmm->ranges); >> 708 spin_unlock(&hmm->lock); >> 709 >> 710 hmm_vma_walk.fault = false; >> 711 hmm_vma_walk.range = range; >> 712 mm_walk.private = &hmm_vma_walk; >> 713 >> 714 mm_walk.vma = vma; >> 715 mm_walk.mm = vma->vm_mm; >> 716 mm_walk.pte_entry = NULL; >> 717 mm_walk.test_walk = NULL; >> 718 mm_walk.hugetlb_entry = NULL; >> 719 mm_walk.pmd_entry = hmm_vma_walk_pmd; >> 720 mm_walk.pte_hole = hmm_vma_walk_hole; 458 721 459 out_unlock: !! 722 walk_page_range(range->start, range->end, &mm_walk); 460 spin_unlock(ptl); << 461 return 0; 723 return 0; 462 } 724 } 463 #else !! 725 EXPORT_SYMBOL(hmm_vma_get_pfns); 464 #define hmm_vma_walk_pud NULL !! 726 465 #endif !! 727 /* 466 !! 728 * hmm_vma_range_done() - stop tracking change to CPU page table over a range 467 #ifdef CONFIG_HUGETLB_PAGE !! 729 * @range: range being tracked 468 static int hmm_vma_walk_hugetlb_entry(pte_t *p !! 730 * Returns: false if range data has been invalidated, true otherwise 469 unsigned !! 731 * 470 struct m !! 732 * Range struct is used to track updates to the CPU page table after a call to >> 733 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done >> 734 * using the data, or wants to lock updates to the data it got from those >> 735 * functions, it must call the hmm_vma_range_done() function, which will then >> 736 * stop tracking CPU page table updates. >> 737 * >> 738 * Note that device driver must still implement general CPU page table update >> 739 * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using >> 740 * the mmu_notifier API directly. >> 741 * >> 742 * CPU page table update tracking done through hmm_range is only temporary and >> 743 * to be used while trying to duplicate CPU page table contents for a range of >> 744 * virtual addresses. >> 745 * >> 746 * There are two ways to use this : >> 747 * again: >> 748 * hmm_vma_get_pfns(range); or hmm_vma_fault(...); >> 749 * trans = device_build_page_table_update_transaction(pfns); >> 750 * device_page_table_lock(); >> 751 * if (!hmm_vma_range_done(range)) { >> 752 * device_page_table_unlock(); >> 753 * goto again; >> 754 * } >> 755 * device_commit_transaction(trans); >> 756 * device_page_table_unlock(); >> 757 * >> 758 * Or: >> 759 * hmm_vma_get_pfns(range); or hmm_vma_fault(...); >> 760 * device_page_table_lock(); >> 761 * hmm_vma_range_done(range); >> 762 * device_update_page_table(range->pfns); >> 763 * device_page_table_unlock(); >> 764 */ >> 765 bool hmm_vma_range_done(struct hmm_range *range) 471 { 766 { 472 unsigned long addr = start, i, pfn; !! 767 unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; 473 struct hmm_vma_walk *hmm_vma_walk = wa !! 768 struct hmm *hmm; 474 struct hmm_range *range = hmm_vma_walk !! 769 475 struct vm_area_struct *vma = walk->vma !! 770 if (range->end <= range->start) { 476 unsigned int required_fault; !! 771 BUG(); 477 unsigned long pfn_req_flags; !! 772 return false; 478 unsigned long cpu_flags; !! 773 } 479 spinlock_t *ptl; !! 774 480 pte_t entry; !! 775 hmm = hmm_register(range->vma->vm_mm); 481 !! 776 if (!hmm) { 482 ptl = huge_pte_lock(hstate_vma(vma), w !! 
777 memset(range->pfns, 0, sizeof(*range->pfns) * npages); 483 entry = huge_ptep_get(walk->mm, addr, !! 778 return false; 484 !! 779 } 485 i = (start - range->start) >> PAGE_SHI << 486 pfn_req_flags = range->hmm_pfns[i]; << 487 cpu_flags = pte_to_hmm_pfn_flags(range << 488 hmm_pfn_flags_order(huge_p << 489 required_fault = << 490 hmm_pte_need_fault(hmm_vma_wal << 491 if (required_fault) { << 492 int ret; << 493 780 494 spin_unlock(ptl); !! 781 spin_lock(&hmm->lock); 495 hugetlb_vma_unlock_read(vma); !! 782 list_del_rcu(&range->list); >> 783 spin_unlock(&hmm->lock); >> 784 >> 785 return range->valid; >> 786 } >> 787 EXPORT_SYMBOL(hmm_vma_range_done); >> 788 >> 789 /* >> 790 * hmm_vma_fault() - try to fault some address in a virtual address range >> 791 * @range: range being faulted >> 792 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) >> 793 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) >> 794 * >> 795 * This is similar to a regular CPU page fault except that it will not trigger >> 796 * any memory migration if the memory being faulted is not accessible by CPUs. >> 797 * >> 798 * On error, for one virtual address in the range, the function will mark the >> 799 * corresponding HMM pfn entry with an error flag. >> 800 * >> 801 * Expected use pattern: >> 802 * retry: >> 803 * down_read(&mm->mmap_sem); >> 804 * // Find vma and address device wants to fault, initialize hmm_pfn_t >> 805 * // array accordingly >> 806 * ret = hmm_vma_fault(range, write, block); >> 807 * switch (ret) { >> 808 * case -EAGAIN: >> 809 * hmm_vma_range_done(range); >> 810 * // You might want to rate limit or yield to play nicely, you may >> 811 * // also commit any valid pfn in the array assuming that you are >> 812 * // getting true from hmm_vma_range_monitor_end() >> 813 * goto retry; >> 814 * case 0: >> 815 * break; >> 816 * case -ENOMEM: >> 817 * case -EINVAL: >> 818 * case -EPERM: >> 819 * default: >> 820 * // Handle error ! >> 821 * up_read(&mm->mmap_sem) >> 822 * return; >> 823 * } >> 824 * // Take device driver lock that serialize device page table update >> 825 * driver_lock_device_page_table_update(); >> 826 * hmm_vma_range_done(range); >> 827 * // Commit pfns we got from hmm_vma_fault() >> 828 * driver_unlock_device_page_table_update(); >> 829 * up_read(&mm->mmap_sem) >> 830 * >> 831 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) >> 832 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! >> 833 * >> 834 * YOU HAVE BEEN WARNED ! >> 835 */ >> 836 int hmm_vma_fault(struct hmm_range *range, bool block) >> 837 { >> 838 struct vm_area_struct *vma = range->vma; >> 839 unsigned long start = range->start; >> 840 struct hmm_vma_walk hmm_vma_walk; >> 841 struct mm_walk mm_walk; >> 842 struct hmm *hmm; >> 843 int ret; >> 844 >> 845 /* Sanity check, this really should not happen ! 
*/ >> 846 if (range->start < vma->vm_start || range->start >= vma->vm_end) >> 847 return -EINVAL; >> 848 if (range->end < vma->vm_start || range->end > vma->vm_end) >> 849 return -EINVAL; >> 850 >> 851 hmm = hmm_register(vma->vm_mm); >> 852 if (!hmm) { >> 853 hmm_pfns_clear(range, range->pfns, range->start, range->end); >> 854 return -ENOMEM; >> 855 } >> 856 /* Caller must have registered a mirror using hmm_mirror_register() */ >> 857 if (!hmm->mmu_notifier.ops) >> 858 return -EINVAL; >> 859 >> 860 /* FIXME support hugetlb fs */ >> 861 if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { >> 862 hmm_pfns_special(range); >> 863 return -EINVAL; >> 864 } >> 865 >> 866 if (!(vma->vm_flags & VM_READ)) { 496 /* 867 /* 497 * Avoid deadlock: drop the vm !! 868 * If vma do not allow read access, then assume that it does 498 * hmm_vma_fault(), which will !! 869 * not allow write access, either. Architecture that allow 499 * drop the vma lock. This is !! 870 * write without read access are not supported by HMM, because 500 * protection point of view, b !! 871 * operations such has atomic access would not work. 501 * use here of either pte or p << 502 * lock. << 503 */ 872 */ 504 ret = hmm_vma_fault(addr, end, !! 873 hmm_pfns_clear(range, range->pfns, range->start, range->end); 505 hugetlb_vma_lock_read(vma); !! 874 return -EPERM; 506 return ret; << 507 } 875 } 508 876 509 pfn = pte_pfn(entry) + ((start & ~hmas !! 877 /* Initialize range to track CPU page table update */ 510 for (; addr < end; addr += PAGE_SIZE, !! 878 spin_lock(&hmm->lock); 511 range->hmm_pfns[i] = pfn | cpu !! 879 range->valid = true; >> 880 list_add_rcu(&range->list, &hmm->ranges); >> 881 spin_unlock(&hmm->lock); >> 882 >> 883 hmm_vma_walk.fault = true; >> 884 hmm_vma_walk.block = block; >> 885 hmm_vma_walk.range = range; >> 886 mm_walk.private = &hmm_vma_walk; >> 887 hmm_vma_walk.last = range->start; >> 888 >> 889 mm_walk.vma = vma; >> 890 mm_walk.mm = vma->vm_mm; >> 891 mm_walk.pte_entry = NULL; >> 892 mm_walk.test_walk = NULL; >> 893 mm_walk.hugetlb_entry = NULL; >> 894 mm_walk.pmd_entry = hmm_vma_walk_pmd; >> 895 mm_walk.pte_hole = hmm_vma_walk_hole; 512 896 513 spin_unlock(ptl); !! 897 do { 514 return 0; !! 898 ret = walk_page_range(start, range->end, &mm_walk); >> 899 start = hmm_vma_walk.last; >> 900 } while (ret == -EAGAIN); >> 901 >> 902 if (ret) { >> 903 unsigned long i; >> 904 >> 905 i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; >> 906 hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, >> 907 range->end); >> 908 hmm_vma_range_done(range); >> 909 } >> 910 return ret; 515 } 911 } 516 #else !! 912 EXPORT_SYMBOL(hmm_vma_fault); 517 #define hmm_vma_walk_hugetlb_entry NULL !! 913 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ 518 #endif /* CONFIG_HUGETLB_PAGE */ << 519 914 520 static int hmm_vma_walk_test(unsigned long sta !! 915 521 struct mm_walk *w !! 916 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) >> 917 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, >> 918 unsigned long addr) 522 { 919 { 523 struct hmm_vma_walk *hmm_vma_walk = wa !! 920 struct page *page; 524 struct hmm_range *range = hmm_vma_walk << 525 struct vm_area_struct *vma = walk->vma << 526 921 527 if (!(vma->vm_flags & (VM_IO | VM_PFNM !! 922 page = alloc_page_vma(GFP_HIGHUSER, vma, addr); 528 vma->vm_flags & VM_READ) !! 923 if (!page) 529 return 0; !! 
924 return NULL; >> 925 lock_page(page); >> 926 return page; >> 927 } >> 928 EXPORT_SYMBOL(hmm_vma_alloc_locked_page); >> 929 >> 930 >> 931 static void hmm_devmem_ref_release(struct percpu_ref *ref) >> 932 { >> 933 struct hmm_devmem *devmem; >> 934 >> 935 devmem = container_of(ref, struct hmm_devmem, ref); >> 936 complete(&devmem->completion); >> 937 } >> 938 >> 939 static void hmm_devmem_ref_exit(void *data) >> 940 { >> 941 struct percpu_ref *ref = data; >> 942 struct hmm_devmem *devmem; >> 943 >> 944 devmem = container_of(ref, struct hmm_devmem, ref); >> 945 percpu_ref_exit(ref); >> 946 devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); >> 947 } >> 948 >> 949 static void hmm_devmem_ref_kill(void *data) >> 950 { >> 951 struct percpu_ref *ref = data; >> 952 struct hmm_devmem *devmem; >> 953 >> 954 devmem = container_of(ref, struct hmm_devmem, ref); >> 955 percpu_ref_kill(ref); >> 956 wait_for_completion(&devmem->completion); >> 957 devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); >> 958 } >> 959 >> 960 static int hmm_devmem_fault(struct vm_area_struct *vma, >> 961 unsigned long addr, >> 962 const struct page *page, >> 963 unsigned int flags, >> 964 pmd_t *pmdp) >> 965 { >> 966 struct hmm_devmem *devmem = page->pgmap->data; >> 967 >> 968 return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); >> 969 } >> 970 >> 971 static void hmm_devmem_free(struct page *page, void *data) >> 972 { >> 973 struct hmm_devmem *devmem = data; >> 974 >> 975 devmem->ops->free(devmem, page); >> 976 } >> 977 >> 978 static DEFINE_MUTEX(hmm_devmem_lock); >> 979 static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); >> 980 >> 981 static void hmm_devmem_radix_release(struct resource *resource) >> 982 { >> 983 resource_size_t key, align_start, align_size; >> 984 >> 985 align_start = resource->start & ~(PA_SECTION_SIZE - 1); >> 986 align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); >> 987 >> 988 mutex_lock(&hmm_devmem_lock); >> 989 for (key = resource->start; >> 990 key <= resource->end; >> 991 key += PA_SECTION_SIZE) >> 992 radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); >> 993 mutex_unlock(&hmm_devmem_lock); >> 994 } >> 995 >> 996 static void hmm_devmem_release(struct device *dev, void *data) >> 997 { >> 998 struct hmm_devmem *devmem = data; >> 999 struct resource *resource = devmem->resource; >> 1000 unsigned long start_pfn, npages; >> 1001 struct zone *zone; >> 1002 struct page *page; >> 1003 >> 1004 if (percpu_ref_tryget_live(&devmem->ref)) { >> 1005 dev_WARN(dev, "%s: page mapping is still live!\n", __func__); >> 1006 percpu_ref_put(&devmem->ref); >> 1007 } >> 1008 >> 1009 /* pages are dead and unused, undo the arch mapping */ >> 1010 start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; >> 1011 npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; >> 1012 >> 1013 page = pfn_to_page(start_pfn); >> 1014 zone = page_zone(page); >> 1015 >> 1016 mem_hotplug_begin(); >> 1017 if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) >> 1018 __remove_pages(zone, start_pfn, npages, NULL); >> 1019 else >> 1020 arch_remove_memory(start_pfn << PAGE_SHIFT, >> 1021 npages << PAGE_SHIFT, NULL); >> 1022 mem_hotplug_done(); >> 1023 >> 1024 hmm_devmem_radix_release(resource); >> 1025 } >> 1026 >> 1027 static int hmm_devmem_pages_create(struct hmm_devmem *devmem) >> 1028 { >> 1029 resource_size_t key, align_start, align_size, align_end; >> 1030 struct device *device = devmem->device; >> 1031 int ret, nid, is_ram; >> 1032 unsigned long pfn; >> 
1033 >> 1034 align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); >> 1035 align_size = ALIGN(devmem->resource->start + >> 1036 resource_size(devmem->resource), >> 1037 PA_SECTION_SIZE) - align_start; >> 1038 >> 1039 is_ram = region_intersects(align_start, align_size, >> 1040 IORESOURCE_SYSTEM_RAM, >> 1041 IORES_DESC_NONE); >> 1042 if (is_ram == REGION_MIXED) { >> 1043 WARN_ONCE(1, "%s attempted on mixed region %pr\n", >> 1044 __func__, devmem->resource); >> 1045 return -ENXIO; >> 1046 } >> 1047 if (is_ram == REGION_INTERSECTS) >> 1048 return -ENXIO; 530 1049 >> 1050 if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) >> 1051 devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; >> 1052 else >> 1053 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; >> 1054 >> 1055 devmem->pagemap.res = *devmem->resource; >> 1056 devmem->pagemap.page_fault = hmm_devmem_fault; >> 1057 devmem->pagemap.page_free = hmm_devmem_free; >> 1058 devmem->pagemap.dev = devmem->device; >> 1059 devmem->pagemap.ref = &devmem->ref; >> 1060 devmem->pagemap.data = devmem; >> 1061 >> 1062 mutex_lock(&hmm_devmem_lock); >> 1063 align_end = align_start + align_size - 1; >> 1064 for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { >> 1065 struct hmm_devmem *dup; >> 1066 >> 1067 dup = radix_tree_lookup(&hmm_devmem_radix, >> 1068 key >> PA_SECTION_SHIFT); >> 1069 if (dup) { >> 1070 dev_err(device, "%s: collides with mapping for %s\n", >> 1071 __func__, dev_name(dup->device)); >> 1072 mutex_unlock(&hmm_devmem_lock); >> 1073 ret = -EBUSY; >> 1074 goto error; >> 1075 } >> 1076 ret = radix_tree_insert(&hmm_devmem_radix, >> 1077 key >> PA_SECTION_SHIFT, >> 1078 devmem); >> 1079 if (ret) { >> 1080 dev_err(device, "%s: failed: %d\n", __func__, ret); >> 1081 mutex_unlock(&hmm_devmem_lock); >> 1082 goto error_radix; >> 1083 } >> 1084 } >> 1085 mutex_unlock(&hmm_devmem_lock); >> 1086 >> 1087 nid = dev_to_node(device); >> 1088 if (nid < 0) >> 1089 nid = numa_mem_id(); >> 1090 >> 1091 mem_hotplug_begin(); 531 /* 1092 /* 532 * vma ranges that don't have struct p !! 1093 * For device private memory we call add_pages() as we only need to 533 * devices directly cannot be handled !! 1094 * allocate and initialize struct page for the device memory. More- >> 1095 * over the device memory is un-accessible thus we do not want to >> 1096 * create a linear mapping for the memory like arch_add_memory() >> 1097 * would do. 534 * 1098 * 535 * If the vma does not allow read acce !! 1099 * For device public memory, which is accesible by the CPU, we do 536 * allow write access either. HMM does !! 1100 * want the linear mapping and thus use arch_add_memory(). 537 * allow write without read. !! 
1101 */ >> 1102 if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) >> 1103 ret = arch_add_memory(nid, align_start, align_size, NULL, >> 1104 false); >> 1105 else >> 1106 ret = add_pages(nid, align_start >> PAGE_SHIFT, >> 1107 align_size >> PAGE_SHIFT, NULL, false); >> 1108 if (ret) { >> 1109 mem_hotplug_done(); >> 1110 goto error_add_memory; >> 1111 } >> 1112 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], >> 1113 align_start >> PAGE_SHIFT, >> 1114 align_size >> PAGE_SHIFT, NULL); >> 1115 mem_hotplug_done(); >> 1116 >> 1117 for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { >> 1118 struct page *page = pfn_to_page(pfn); >> 1119 >> 1120 page->pgmap = &devmem->pagemap; >> 1121 } >> 1122 return 0; >> 1123 >> 1124 error_add_memory: >> 1125 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); >> 1126 error_radix: >> 1127 hmm_devmem_radix_release(devmem->resource); >> 1128 error: >> 1129 return ret; >> 1130 } >> 1131 >> 1132 static int hmm_devmem_match(struct device *dev, void *data, void *match_data) >> 1133 { >> 1134 struct hmm_devmem *devmem = data; >> 1135 >> 1136 return devmem->resource == match_data; >> 1137 } >> 1138 >> 1139 static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) >> 1140 { >> 1141 devres_release(devmem->device, &hmm_devmem_release, >> 1142 &hmm_devmem_match, devmem->resource); >> 1143 } >> 1144 >> 1145 /* >> 1146 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory >> 1147 * >> 1148 * @ops: memory event device driver callback (see struct hmm_devmem_ops) >> 1149 * @device: device struct to bind the resource too >> 1150 * @size: size in bytes of the device memory to add >> 1151 * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise >> 1152 * >> 1153 * This function first finds an empty range of physical address big enough to >> 1154 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which >> 1155 * in turn allocates struct pages. It does not do anything beyond that; all >> 1156 * events affecting the memory will go through the various callbacks provided >> 1157 * by hmm_devmem_ops struct. >> 1158 * >> 1159 * Device driver should call this function during device initialization and >> 1160 * is then responsible of memory management. HMM only provides helpers. 
>> 1161 */ >> 1162 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, >> 1163 struct device *device, >> 1164 unsigned long size) >> 1165 { >> 1166 struct hmm_devmem *devmem; >> 1167 resource_size_t addr; >> 1168 int ret; >> 1169 >> 1170 static_branch_enable(&device_private_key); >> 1171 >> 1172 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), >> 1173 GFP_KERNEL, dev_to_node(device)); >> 1174 if (!devmem) >> 1175 return ERR_PTR(-ENOMEM); >> 1176 >> 1177 init_completion(&devmem->completion); >> 1178 devmem->pfn_first = -1UL; >> 1179 devmem->pfn_last = -1UL; >> 1180 devmem->resource = NULL; >> 1181 devmem->device = device; >> 1182 devmem->ops = ops; >> 1183 >> 1184 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, >> 1185 0, GFP_KERNEL); >> 1186 if (ret) >> 1187 goto error_percpu_ref; >> 1188 >> 1189 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); >> 1190 if (ret) >> 1191 goto error_devm_add_action; >> 1192 >> 1193 size = ALIGN(size, PA_SECTION_SIZE); >> 1194 addr = min((unsigned long)iomem_resource.end, >> 1195 (1UL << MAX_PHYSMEM_BITS) - 1); >> 1196 addr = addr - size + 1UL; >> 1197 >> 1198 /* >> 1199 * FIXME add a new helper to quickly walk resource tree and find free >> 1200 * range 538 * 1201 * 539 * If a fault is requested for an unsu !! 1202 * FIXME what about ioport_resource resource ? 540 * failure. << 541 */ 1203 */ 542 if (hmm_range_need_fault(hmm_vma_walk, !! 1204 for (; addr > size && addr >= iomem_resource.start; addr -= size) { 543 range->hmm_pf !! 1205 ret = region_intersects(addr, size, 0, IORES_DESC_NONE); 544 ((sta !! 1206 if (ret != REGION_DISJOINT) 545 (end - start) !! 1207 continue; 546 return -EFAULT; !! 1208 >> 1209 devmem->resource = devm_request_mem_region(device, addr, size, >> 1210 dev_name(device)); >> 1211 if (!devmem->resource) { >> 1212 ret = -ENOMEM; >> 1213 goto error_no_resource; >> 1214 } >> 1215 break; >> 1216 } >> 1217 if (!devmem->resource) { >> 1218 ret = -ERANGE; >> 1219 goto error_no_resource; >> 1220 } >> 1221 >> 1222 devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; >> 1223 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; >> 1224 devmem->pfn_last = devmem->pfn_first + >> 1225 (resource_size(devmem->resource) >> PAGE_SHIFT); >> 1226 >> 1227 ret = hmm_devmem_pages_create(devmem); >> 1228 if (ret) >> 1229 goto error_pages; >> 1230 >> 1231 devres_add(device, devmem); >> 1232 >> 1233 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); >> 1234 if (ret) { >> 1235 hmm_devmem_remove(devmem); >> 1236 return ERR_PTR(ret); >> 1237 } 547 1238 548 hmm_pfns_fill(start, end, range, HMM_P !! 1239 return devmem; 549 1240 550 /* Skip this vma and continue processi !! 1241 error_pages: 551 return 1; !! 1242 devm_release_mem_region(device, devmem->resource->start, >> 1243 resource_size(devmem->resource)); >> 1244 error_no_resource: >> 1245 error_devm_add_action: >> 1246 hmm_devmem_ref_kill(&devmem->ref); >> 1247 hmm_devmem_ref_exit(&devmem->ref); >> 1248 error_percpu_ref: >> 1249 devres_free(devmem); >> 1250 return ERR_PTR(ret); 552 } 1251 } >> 1252 EXPORT_SYMBOL(hmm_devmem_add); 553 1253 554 static const struct mm_walk_ops hmm_walk_ops = !! 1254 struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, 555 .pud_entry = hmm_vma_walk_pud, !! 1255 struct device *device, 556 .pmd_entry = hmm_vma_walk_pmd, !! 1256 struct resource *res) 557 .pte_hole = hmm_vma_walk_hole, !! 1257 { 558 .hugetlb_entry = hmm_vma_walk_hugetlb !! 
1258 struct hmm_devmem *devmem; 559 .test_walk = hmm_vma_walk_test, !! 1259 int ret; 560 .walk_lock = PGWALK_RDLOCK, !! 1260 561 }; !! 1261 if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) >> 1262 return ERR_PTR(-EINVAL); 562 1263 563 /** !! 1264 static_branch_enable(&device_private_key); 564 * hmm_range_fault - try to fault some address !! 1265 565 * @range: argument structure !! 1266 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 566 * !! 1267 GFP_KERNEL, dev_to_node(device)); 567 * Returns 0 on success or one of the followin !! 1268 if (!devmem) 568 * !! 1269 return ERR_PTR(-ENOMEM); 569 * -EINVAL: Invalid arguments or mm or vir !! 1270 570 * (e.g., device file vma). !! 1271 init_completion(&devmem->completion); 571 * -ENOMEM: Out of memory. !! 1272 devmem->pfn_first = -1UL; 572 * -EPERM: Invalid permission (e.g., aski !! 1273 devmem->pfn_last = -1UL; 573 * only). !! 1274 devmem->resource = res; 574 * -EBUSY: The range has been invalidated !! 1275 devmem->device = device; 575 * the invalidation to finish. !! 1276 devmem->ops = ops; 576 * -EFAULT: A page was requested to be val !! 1277 577 * ie it has no backing VMA or it !! 1278 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 578 * !! 1279 0, GFP_KERNEL); 579 * This is similar to get_user_pages(), except !! 1280 if (ret) 580 * without mutating them (ie causing faults). !! 1281 goto error_percpu_ref; 581 */ !! 1282 582 int hmm_range_fault(struct hmm_range *range) !! 1283 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 583 { !! 1284 if (ret) 584 struct hmm_vma_walk hmm_vma_walk = { !! 1285 goto error_devm_add_action; 585 .range = range, !! 1286 586 .last = range->start, !! 1287 587 }; !! 1288 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 588 struct mm_struct *mm = range->notifier !! 1289 devmem->pfn_last = devmem->pfn_first + >> 1290 (resource_size(devmem->resource) >> PAGE_SHIFT); >> 1291 >> 1292 ret = hmm_devmem_pages_create(devmem); >> 1293 if (ret) >> 1294 goto error_devm_add_action; >> 1295 >> 1296 devres_add(device, devmem); >> 1297 >> 1298 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); >> 1299 if (ret) { >> 1300 hmm_devmem_remove(devmem); >> 1301 return ERR_PTR(ret); >> 1302 } >> 1303 >> 1304 return devmem; >> 1305 >> 1306 error_devm_add_action: >> 1307 hmm_devmem_ref_kill(&devmem->ref); >> 1308 hmm_devmem_ref_exit(&devmem->ref); >> 1309 error_percpu_ref: >> 1310 devres_free(devmem); >> 1311 return ERR_PTR(ret); >> 1312 } >> 1313 EXPORT_SYMBOL(hmm_devmem_add_resource); >> 1314 >> 1315 /* >> 1316 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) >> 1317 * >> 1318 * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory >> 1319 * >> 1320 * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf >> 1321 * of the device driver. It will free struct page and remove the resource that >> 1322 * reserved the physical address range for this device memory. 
>> 1323 */ >> 1324 void hmm_devmem_remove(struct hmm_devmem *devmem) >> 1325 { >> 1326 resource_size_t start, size; >> 1327 struct device *device; >> 1328 bool cdm = false; >> 1329 >> 1330 if (!devmem) >> 1331 return; >> 1332 >> 1333 device = devmem->device; >> 1334 start = devmem->resource->start; >> 1335 size = resource_size(devmem->resource); >> 1336 >> 1337 cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; >> 1338 hmm_devmem_ref_kill(&devmem->ref); >> 1339 hmm_devmem_ref_exit(&devmem->ref); >> 1340 hmm_devmem_pages_remove(devmem); >> 1341 >> 1342 if (!cdm) >> 1343 devm_release_mem_region(device, start, size); >> 1344 } >> 1345 EXPORT_SYMBOL(hmm_devmem_remove); >> 1346 >> 1347 /* >> 1348 * A device driver that wants to handle multiple devices memory through a >> 1349 * single fake device can use hmm_device to do so. This is purely a helper >> 1350 * and it is not needed to make use of any HMM functionality. >> 1351 */ >> 1352 #define HMM_DEVICE_MAX 256 >> 1353 >> 1354 static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); >> 1355 static DEFINE_SPINLOCK(hmm_device_lock); >> 1356 static struct class *hmm_device_class; >> 1357 static dev_t hmm_device_devt; >> 1358 >> 1359 static void hmm_device_release(struct device *device) >> 1360 { >> 1361 struct hmm_device *hmm_device; >> 1362 >> 1363 hmm_device = container_of(device, struct hmm_device, device); >> 1364 spin_lock(&hmm_device_lock); >> 1365 clear_bit(hmm_device->minor, hmm_device_mask); >> 1366 spin_unlock(&hmm_device_lock); >> 1367 >> 1368 kfree(hmm_device); >> 1369 } >> 1370 >> 1371 struct hmm_device *hmm_device_new(void *drvdata) >> 1372 { >> 1373 struct hmm_device *hmm_device; >> 1374 >> 1375 hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); >> 1376 if (!hmm_device) >> 1377 return ERR_PTR(-ENOMEM); >> 1378 >> 1379 spin_lock(&hmm_device_lock); >> 1380 hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); >> 1381 if (hmm_device->minor >= HMM_DEVICE_MAX) { >> 1382 spin_unlock(&hmm_device_lock); >> 1383 kfree(hmm_device); >> 1384 return ERR_PTR(-EBUSY); >> 1385 } >> 1386 set_bit(hmm_device->minor, hmm_device_mask); >> 1387 spin_unlock(&hmm_device_lock); >> 1388 >> 1389 dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); >> 1390 hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), >> 1391 hmm_device->minor); >> 1392 hmm_device->device.release = hmm_device_release; >> 1393 dev_set_drvdata(&hmm_device->device, drvdata); >> 1394 hmm_device->device.class = hmm_device_class; >> 1395 device_initialize(&hmm_device->device); >> 1396 >> 1397 return hmm_device; >> 1398 } >> 1399 EXPORT_SYMBOL(hmm_device_new); >> 1400 >> 1401 void hmm_device_put(struct hmm_device *hmm_device) >> 1402 { >> 1403 put_device(&hmm_device->device); >> 1404 } >> 1405 EXPORT_SYMBOL(hmm_device_put); >> 1406 >> 1407 static int __init hmm_init(void) >> 1408 { 589 int ret; 1409 int ret; 590 1410 591 mmap_assert_locked(mm); !! 1411 ret = alloc_chrdev_region(&hmm_device_devt, 0, >> 1412 HMM_DEVICE_MAX, >> 1413 "hmm_device"); >> 1414 if (ret) >> 1415 return ret; 592 1416 593 do { !! 1417 hmm_device_class = class_create(THIS_MODULE, "hmm_device"); 594 /* If range is no longer valid !! 1418 if (IS_ERR(hmm_device_class)) { 595 if (mmu_interval_check_retry(r !! 1419 unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); 596 r !! 1420 return PTR_ERR(hmm_device_class); 597 return -EBUSY; !! 1421 } 598 ret = walk_page_range(mm, hmm_ !! 
1422 return 0; 599 &hmm_wal << 600 /* << 601 * When -EBUSY is returned the << 602 * hmm_vma_walk.last set to an << 603 * in pfns. All entries < last << 604 * output, and all >= are stil << 605 */ << 606 } while (ret == -EBUSY); << 607 return ret; << 608 } 1423 } 609 EXPORT_SYMBOL(hmm_range_fault); !! 1424 >> 1425 device_initcall(hmm_init); >> 1426 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ 610 1427
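
The newer version of this file shown above drops the old hmm_vma_fault()/hmm_vma_range_done() pair in favour of hmm_range_fault(), whose comment says the caller should retry on -EBUSY and revalidate the snapshot against its mmu_interval_notifier. As a rough illustration of that calling convention (following the pattern documented in Documentation/mm/hmm.rst), a driver-side loop might look like the sketch below; the driver_* helpers and the device page-table lock are illustrative assumptions, not part of this file.

	/*
	 * Illustrative only: one way a driver might drive hmm_range_fault().
	 * The driver_*_device_pagetable() helpers are hypothetical.
	 */
	#include <linux/hmm.h>
	#include <linux/mm.h>
	#include <linux/mmu_notifier.h>

	static int driver_populate_range(struct mmu_interval_notifier *notifier,
					 struct mm_struct *mm,
					 unsigned long start, unsigned long end,
					 unsigned long *pfns)
	{
		struct hmm_range range = {
			.notifier = notifier,
			.start = start,
			.end = end,
			.hmm_pfns = pfns,
			/* fault everything in and ask for write access */
			.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
		};
		int ret;

	again:
		range.notifier_seq = mmu_interval_read_begin(notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(&range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)	/* raced with an invalidation */
				goto again;
			return ret;
		}

		driver_lock_device_pagetable();		/* hypothetical driver lock */
		if (mmu_interval_read_retry(notifier, range.notifier_seq)) {
			driver_unlock_device_pagetable();
			goto again;
		}
		/* pfns[] is stable while the driver lock is held: program the device */
		driver_program_device_pagetable(pfns, start, end);
		driver_unlock_device_pagetable();
		return 0;
	}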