1 // SPDX-License-Identifier: GPL-2.0-or-later << 2 /* 1 /* 3 * Copyright 2013 Red Hat Inc. 2 * Copyright 2013 Red Hat Inc. 4 * 3 * 5 * Authors: Jérôme Glisse <jglisse@redhat.co !! 4 * This program is free software; you can redistribute it and/or modify >> 5 * it under the terms of the GNU General Public License as published by >> 6 * the Free Software Foundation; either version 2 of the License, or >> 7 * (at your option) any later version. >> 8 * >> 9 * This program is distributed in the hope that it will be useful, >> 10 * but WITHOUT ANY WARRANTY; without even the implied warranty of >> 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> 12 * GNU General Public License for more details. >> 13 * >> 14 * Authors: Jérôme Glisse <jglisse@redhat.com> 6 */ 15 */ 7 /* 16 /* 8 * Refer to include/linux/hmm.h for informatio 17 * Refer to include/linux/hmm.h for information about heterogeneous memory 9 * management or HMM for short. 18 * management or HMM for short. 10 */ 19 */ 11 #include <linux/pagewalk.h> !! 20 #include <linux/mm.h> 12 #include <linux/hmm.h> 21 #include <linux/hmm.h> 13 #include <linux/init.h> 22 #include <linux/init.h> 14 #include <linux/rmap.h> 23 #include <linux/rmap.h> 15 #include <linux/swap.h> 24 #include <linux/swap.h> 16 #include <linux/slab.h> 25 #include <linux/slab.h> 17 #include <linux/sched.h> 26 #include <linux/sched.h> 18 #include <linux/mmzone.h> 27 #include <linux/mmzone.h> 19 #include <linux/pagemap.h> 28 #include <linux/pagemap.h> 20 #include <linux/swapops.h> 29 #include <linux/swapops.h> 21 #include <linux/hugetlb.h> 30 #include <linux/hugetlb.h> 22 #include <linux/memremap.h> 31 #include <linux/memremap.h> 23 #include <linux/sched/mm.h> << 24 #include <linux/jump_label.h> 32 #include <linux/jump_label.h> 25 #include <linux/dma-mapping.h> << 26 #include <linux/mmu_notifier.h> 33 #include <linux/mmu_notifier.h> 27 #include <linux/memory_hotplug.h> 34 #include <linux/memory_hotplug.h> 28 35 29 #include "internal.h" !! 36 #define PA_SECTION_SIZE (1UL << PA_SECTION_SHIFT) >> 37 >> 38 #if defined(CONFIG_DEVICE_PRIVATE) || defined(CONFIG_DEVICE_PUBLIC) >> 39 /* >> 40 * Device private memory see HMM (Documentation/vm/hmm.txt) or hmm.h >> 41 */ >> 42 DEFINE_STATIC_KEY_FALSE(device_private_key); >> 43 EXPORT_SYMBOL(device_private_key); >> 44 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ >> 45 >> 46 >> 47 #if IS_ENABLED(CONFIG_HMM_MIRROR) >> 48 static const struct mmu_notifier_ops hmm_mmu_notifier_ops; >> 49 >> 50 /* >> 51 * struct hmm - HMM per mm struct >> 52 * >> 53 * @mm: mm struct this HMM struct is bound to >> 54 * @lock: lock protecting ranges list >> 55 * @sequence: we track updates to the CPU page table with a sequence number >> 56 * @ranges: list of range being snapshotted >> 57 * @mirrors: list of mirrors for this mm >> 58 * @mmu_notifier: mmu notifier to track updates to CPU page table >> 59 * @mirrors_sem: read/write semaphore protecting the mirrors list >> 60 */ >> 61 struct hmm { >> 62 struct mm_struct *mm; >> 63 spinlock_t lock; >> 64 atomic_t sequence; >> 65 struct list_head ranges; >> 66 struct list_head mirrors; >> 67 struct mmu_notifier mmu_notifier; >> 68 struct rw_semaphore mirrors_sem; >> 69 }; >> 70 >> 71 /* >> 72 * hmm_register - register HMM against an mm (HMM internal) >> 73 * >> 74 * @mm: mm struct to attach to >> 75 * >> 76 * This is not intended to be used directly by device drivers. It allocates an >> 77 * HMM struct if mm does not have one, and initializes it. 
>> 78 */ >> 79 static struct hmm *hmm_register(struct mm_struct *mm) >> 80 { >> 81 struct hmm *hmm = READ_ONCE(mm->hmm); >> 82 bool cleanup = false; >> 83 >> 84 /* >> 85 * The hmm struct can only be freed once the mm_struct goes away, >> 86 * hence we should always have pre-allocated an new hmm struct >> 87 * above. >> 88 */ >> 89 if (hmm) >> 90 return hmm; >> 91 >> 92 hmm = kmalloc(sizeof(*hmm), GFP_KERNEL); >> 93 if (!hmm) >> 94 return NULL; >> 95 INIT_LIST_HEAD(&hmm->mirrors); >> 96 init_rwsem(&hmm->mirrors_sem); >> 97 atomic_set(&hmm->sequence, 0); >> 98 hmm->mmu_notifier.ops = NULL; >> 99 INIT_LIST_HEAD(&hmm->ranges); >> 100 spin_lock_init(&hmm->lock); >> 101 hmm->mm = mm; >> 102 >> 103 /* >> 104 * We should only get here if hold the mmap_sem in write mode ie on >> 105 * registration of first mirror through hmm_mirror_register() >> 106 */ >> 107 hmm->mmu_notifier.ops = &hmm_mmu_notifier_ops; >> 108 if (__mmu_notifier_register(&hmm->mmu_notifier, mm)) { >> 109 kfree(hmm); >> 110 return NULL; >> 111 } >> 112 >> 113 spin_lock(&mm->page_table_lock); >> 114 if (!mm->hmm) >> 115 mm->hmm = hmm; >> 116 else >> 117 cleanup = true; >> 118 spin_unlock(&mm->page_table_lock); >> 119 >> 120 if (cleanup) { >> 121 mmu_notifier_unregister(&hmm->mmu_notifier, mm); >> 122 kfree(hmm); >> 123 } >> 124 >> 125 return mm->hmm; >> 126 } >> 127 >> 128 void hmm_mm_destroy(struct mm_struct *mm) >> 129 { >> 130 kfree(mm->hmm); >> 131 } >> 132 >> 133 static void hmm_invalidate_range(struct hmm *hmm, >> 134 enum hmm_update_type action, >> 135 unsigned long start, >> 136 unsigned long end) >> 137 { >> 138 struct hmm_mirror *mirror; >> 139 struct hmm_range *range; >> 140 >> 141 spin_lock(&hmm->lock); >> 142 list_for_each_entry(range, &hmm->ranges, list) { >> 143 unsigned long addr, idx, npages; >> 144 >> 145 if (end < range->start || start >= range->end) >> 146 continue; >> 147 >> 148 range->valid = false; >> 149 addr = max(start, range->start); >> 150 idx = (addr - range->start) >> PAGE_SHIFT; >> 151 npages = (min(range->end, end) - addr) >> PAGE_SHIFT; >> 152 memset(&range->pfns[idx], 0, sizeof(*range->pfns) * npages); >> 153 } >> 154 spin_unlock(&hmm->lock); >> 155 >> 156 down_read(&hmm->mirrors_sem); >> 157 list_for_each_entry(mirror, &hmm->mirrors, list) >> 158 mirror->ops->sync_cpu_device_pagetables(mirror, action, >> 159 start, end); >> 160 up_read(&hmm->mirrors_sem); >> 161 } >> 162 >> 163 static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) >> 164 { >> 165 struct hmm_mirror *mirror; >> 166 struct hmm *hmm = mm->hmm; >> 167 >> 168 down_write(&hmm->mirrors_sem); >> 169 mirror = list_first_entry_or_null(&hmm->mirrors, struct hmm_mirror, >> 170 list); >> 171 while (mirror) { >> 172 list_del_init(&mirror->list); >> 173 if (mirror->ops->release) { >> 174 /* >> 175 * Drop mirrors_sem so callback can wait on any pending >> 176 * work that might itself trigger mmu_notifier callback >> 177 * and thus would deadlock with us. 
>> 178 */ >> 179 up_write(&hmm->mirrors_sem); >> 180 mirror->ops->release(mirror); >> 181 down_write(&hmm->mirrors_sem); >> 182 } >> 183 mirror = list_first_entry_or_null(&hmm->mirrors, >> 184 struct hmm_mirror, list); >> 185 } >> 186 up_write(&hmm->mirrors_sem); >> 187 } >> 188 >> 189 static void hmm_invalidate_range_start(struct mmu_notifier *mn, >> 190 struct mm_struct *mm, >> 191 unsigned long start, >> 192 unsigned long end) >> 193 { >> 194 struct hmm *hmm = mm->hmm; >> 195 >> 196 VM_BUG_ON(!hmm); >> 197 >> 198 atomic_inc(&hmm->sequence); >> 199 } >> 200 >> 201 static void hmm_invalidate_range_end(struct mmu_notifier *mn, >> 202 struct mm_struct *mm, >> 203 unsigned long start, >> 204 unsigned long end) >> 205 { >> 206 struct hmm *hmm = mm->hmm; >> 207 >> 208 VM_BUG_ON(!hmm); >> 209 >> 210 hmm_invalidate_range(mm->hmm, HMM_UPDATE_INVALIDATE, start, end); >> 211 } >> 212 >> 213 static const struct mmu_notifier_ops hmm_mmu_notifier_ops = { >> 214 .release = hmm_release, >> 215 .invalidate_range_start = hmm_invalidate_range_start, >> 216 .invalidate_range_end = hmm_invalidate_range_end, >> 217 }; >> 218 >> 219 /* >> 220 * hmm_mirror_register() - register a mirror against an mm >> 221 * >> 222 * @mirror: new mirror struct to register >> 223 * @mm: mm to register against >> 224 * >> 225 * To start mirroring a process address space, the device driver must register >> 226 * an HMM mirror struct. >> 227 * >> 228 * THE mm->mmap_sem MUST BE HELD IN WRITE MODE ! >> 229 */ >> 230 int hmm_mirror_register(struct hmm_mirror *mirror, struct mm_struct *mm) >> 231 { >> 232 /* Sanity check */ >> 233 if (!mm || !mirror || !mirror->ops) >> 234 return -EINVAL; >> 235 >> 236 again: >> 237 mirror->hmm = hmm_register(mm); >> 238 if (!mirror->hmm) >> 239 return -ENOMEM; >> 240 >> 241 down_write(&mirror->hmm->mirrors_sem); >> 242 if (mirror->hmm->mm == NULL) { >> 243 /* >> 244 * A racing hmm_mirror_unregister() is about to destroy the hmm >> 245 * struct. Try again to allocate a new one. >> 246 */ >> 247 up_write(&mirror->hmm->mirrors_sem); >> 248 mirror->hmm = NULL; >> 249 goto again; >> 250 } else { >> 251 list_add(&mirror->list, &mirror->hmm->mirrors); >> 252 up_write(&mirror->hmm->mirrors_sem); >> 253 } >> 254 >> 255 return 0; >> 256 } >> 257 EXPORT_SYMBOL(hmm_mirror_register); >> 258 >> 259 /* >> 260 * hmm_mirror_unregister() - unregister a mirror >> 261 * >> 262 * @mirror: new mirror struct to register >> 263 * >> 264 * Stop mirroring a process address space, and cleanup. 
>> 265 */ >> 266 void hmm_mirror_unregister(struct hmm_mirror *mirror) >> 267 { >> 268 bool should_unregister = false; >> 269 struct mm_struct *mm; >> 270 struct hmm *hmm; >> 271 >> 272 if (mirror->hmm == NULL) >> 273 return; >> 274 >> 275 hmm = mirror->hmm; >> 276 down_write(&hmm->mirrors_sem); >> 277 list_del_init(&mirror->list); >> 278 should_unregister = list_empty(&hmm->mirrors); >> 279 mirror->hmm = NULL; >> 280 mm = hmm->mm; >> 281 hmm->mm = NULL; >> 282 up_write(&hmm->mirrors_sem); >> 283 >> 284 if (!should_unregister || mm == NULL) >> 285 return; >> 286 >> 287 spin_lock(&mm->page_table_lock); >> 288 if (mm->hmm == hmm) >> 289 mm->hmm = NULL; >> 290 spin_unlock(&mm->page_table_lock); >> 291 >> 292 mmu_notifier_unregister_no_release(&hmm->mmu_notifier, mm); >> 293 kfree(hmm); >> 294 } >> 295 EXPORT_SYMBOL(hmm_mirror_unregister); 30 296 31 struct hmm_vma_walk { 297 struct hmm_vma_walk { 32 struct hmm_range *range; 298 struct hmm_range *range; 33 unsigned long last; 299 unsigned long last; >> 300 bool fault; >> 301 bool block; 34 }; 302 }; 35 303 36 enum { !! 304 static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, 37 HMM_NEED_FAULT = 1 << 0, !! 305 bool write_fault, uint64_t *pfn) 38 HMM_NEED_WRITE_FAULT = 1 << 1, !! 306 { 39 HMM_NEED_ALL_BITS = HMM_NEED_FAULT | H !! 307 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; 40 }; !! 308 struct hmm_vma_walk *hmm_vma_walk = walk->private; >> 309 struct hmm_range *range = hmm_vma_walk->range; >> 310 struct vm_area_struct *vma = walk->vma; >> 311 int r; >> 312 >> 313 flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; >> 314 flags |= write_fault ? FAULT_FLAG_WRITE : 0; >> 315 r = handle_mm_fault(vma, addr, flags); >> 316 if (r & VM_FAULT_RETRY) >> 317 return -EBUSY; >> 318 if (r & VM_FAULT_ERROR) { >> 319 *pfn = range->values[HMM_PFN_ERROR]; >> 320 return -EFAULT; >> 321 } >> 322 >> 323 return -EAGAIN; >> 324 } 41 325 42 static int hmm_pfns_fill(unsigned long addr, u !! 326 static int hmm_pfns_bad(unsigned long addr, 43 struct hmm_range *ran !! 327 unsigned long end, >> 328 struct mm_walk *walk) 44 { 329 { 45 unsigned long i = (addr - range->start !! 330 struct hmm_vma_walk *hmm_vma_walk = walk->private; >> 331 struct hmm_range *range = hmm_vma_walk->range; >> 332 uint64_t *pfns = range->pfns; >> 333 unsigned long i; 46 334 >> 335 i = (addr - range->start) >> PAGE_SHIFT; 47 for (; addr < end; addr += PAGE_SIZE, 336 for (; addr < end; addr += PAGE_SIZE, i++) 48 range->hmm_pfns[i] = cpu_flags !! 337 pfns[i] = range->values[HMM_PFN_ERROR]; >> 338 49 return 0; 339 return 0; 50 } 340 } 51 341 52 /* 342 /* 53 * hmm_vma_fault() - fault in a range lacking !! 343 * hmm_vma_walk_hole() - handle a range lacking valid pmd or pte(s) 54 * @addr: range virtual start address (inclusi !! 344 * @start: range virtual start address (inclusive) 55 * @end: range virtual end address (exclusive) 345 * @end: range virtual end address (exclusive) 56 * @required_fault: HMM_NEED_* flags !! 346 * @fault: should we fault or not ? >> 347 * @write_fault: write fault ? 57 * @walk: mm_walk structure 348 * @walk: mm_walk structure 58 * Return: -EBUSY after page fault, or page fa !! 349 * Returns: 0 on success, -EAGAIN after page fault, or page fault error 59 * 350 * 60 * This function will be called whenever pmd_n 351 * This function will be called whenever pmd_none() or pte_none() returns true, 61 * or whenever there is no page directory cove 352 * or whenever there is no page directory covering the virtual address range. 
62 */ 353 */ 63 static int hmm_vma_fault(unsigned long addr, u !! 354 static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, 64 unsigned int required !! 355 bool fault, bool write_fault, >> 356 struct mm_walk *walk) 65 { 357 { 66 struct hmm_vma_walk *hmm_vma_walk = wa 358 struct hmm_vma_walk *hmm_vma_walk = walk->private; 67 struct vm_area_struct *vma = walk->vma !! 359 struct hmm_range *range = hmm_vma_walk->range; 68 unsigned int fault_flags = FAULT_FLAG_ !! 360 uint64_t *pfns = range->pfns; >> 361 unsigned long i; 69 362 70 WARN_ON_ONCE(!required_fault); << 71 hmm_vma_walk->last = addr; 363 hmm_vma_walk->last = addr; >> 364 i = (addr - range->start) >> PAGE_SHIFT; >> 365 for (; addr < end; addr += PAGE_SIZE, i++) { >> 366 pfns[i] = range->values[HMM_PFN_NONE]; >> 367 if (fault || write_fault) { >> 368 int ret; >> 369 >> 370 ret = hmm_vma_do_fault(walk, addr, write_fault, >> 371 &pfns[i]); >> 372 if (ret != -EAGAIN) >> 373 return ret; >> 374 } >> 375 } 72 376 73 if (required_fault & HMM_NEED_WRITE_FA !! 377 return (fault || write_fault) ? -EAGAIN : 0; 74 if (!(vma->vm_flags & VM_WRITE !! 378 } 75 return -EPERM; !! 379 76 fault_flags |= FAULT_FLAG_WRIT !! 380 static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 77 } !! 381 uint64_t pfns, uint64_t cpu_flags, 78 !! 382 bool *fault, bool *write_fault) 79 for (; addr < end; addr += PAGE_SIZE) << 80 if (handle_mm_fault(vma, addr, << 81 VM_FAULT_ERROR) << 82 return -EFAULT; << 83 return -EBUSY; << 84 } << 85 << 86 static unsigned int hmm_pte_need_fault(const s << 87 unsigne << 88 unsigne << 89 { 383 { 90 struct hmm_range *range = hmm_vma_walk 384 struct hmm_range *range = hmm_vma_walk->range; 91 385 92 /* !! 386 *fault = *write_fault = false; 93 * So we not only consider the individ !! 387 if (!hmm_vma_walk->fault) 94 * consider the default flags requeste !! 388 return; 95 * be used 2 ways. The first one where << 96 * multiple page faults into one reque << 97 * those faults. The second one where << 98 * fault a range with specific flags. << 99 * waste to have the user pre-fill the << 100 * flags value. << 101 */ << 102 pfn_req_flags &= range->pfn_flags_mask << 103 pfn_req_flags |= range->default_flags; << 104 389 105 /* We aren't ask to do anything ... */ 390 /* We aren't ask to do anything ... */ 106 if (!(pfn_req_flags & HMM_PFN_REQ_FAUL !! 391 if (!(pfns & range->flags[HMM_PFN_VALID])) 107 return 0; !! 392 return; 108 !! 393 /* If this is device memory than only fault if explicitly requested */ 109 /* Need to write fault ? */ !! 394 if ((cpu_flags & range->flags[HMM_PFN_DEVICE_PRIVATE])) { 110 if ((pfn_req_flags & HMM_PFN_REQ_WRITE !! 395 /* Do we fault on device memory ? */ 111 !(cpu_flags & HMM_PFN_WRITE)) !! 396 if (pfns & range->flags[HMM_PFN_DEVICE_PRIVATE]) { 112 return HMM_NEED_FAULT | HMM_NE !! 397 *write_fault = pfns & range->flags[HMM_PFN_WRITE]; >> 398 *fault = true; >> 399 } >> 400 return; >> 401 } 113 402 114 /* If CPU page table is not valid then 403 /* If CPU page table is not valid then we need to fault */ 115 if (!(cpu_flags & HMM_PFN_VALID)) !! 404 *fault = !(cpu_flags & range->flags[HMM_PFN_VALID]); 116 return HMM_NEED_FAULT; !! 405 /* Need to write fault ? */ 117 return 0; !! 406 if ((pfns & range->flags[HMM_PFN_WRITE]) && >> 407 !(cpu_flags & range->flags[HMM_PFN_WRITE])) { >> 408 *write_fault = true; >> 409 *fault = true; >> 410 } 118 } 411 } 119 412 120 static unsigned int !! 
413 static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, 121 hmm_range_need_fault(const struct hmm_vma_walk !! 414 const uint64_t *pfns, unsigned long npages, 122 const unsigned long hmm_p !! 415 uint64_t cpu_flags, bool *fault, 123 unsigned long cpu_flags) !! 416 bool *write_fault) 124 { 417 { 125 struct hmm_range *range = hmm_vma_walk << 126 unsigned int required_fault = 0; << 127 unsigned long i; 418 unsigned long i; 128 419 129 /* !! 420 if (!hmm_vma_walk->fault) { 130 * If the default flags do not request !! 421 *fault = *write_fault = false; 131 * not allow for individual pages to b !! 422 return; 132 * hmm_pte_need_fault() will always re !! 423 } 133 */ << 134 if (!((range->default_flags | range->p << 135 HMM_PFN_REQ_FAULT)) << 136 return 0; << 137 424 138 for (i = 0; i < npages; ++i) { 425 for (i = 0; i < npages; ++i) { 139 required_fault |= hmm_pte_need !! 426 hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, 140 !! 427 fault, write_fault); 141 if (required_fault == HMM_NEED !! 428 if ((*fault) || (*write_fault)) 142 return required_fault; !! 429 return; 143 } 430 } 144 return required_fault; << 145 } 431 } 146 432 147 static int hmm_vma_walk_hole(unsigned long add 433 static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, 148 __always_unused i !! 434 struct mm_walk *walk) 149 { 435 { 150 struct hmm_vma_walk *hmm_vma_walk = wa 436 struct hmm_vma_walk *hmm_vma_walk = walk->private; 151 struct hmm_range *range = hmm_vma_walk 437 struct hmm_range *range = hmm_vma_walk->range; 152 unsigned int required_fault; !! 438 bool fault, write_fault; 153 unsigned long i, npages; 439 unsigned long i, npages; 154 unsigned long *hmm_pfns; !! 440 uint64_t *pfns; 155 441 156 i = (addr - range->start) >> PAGE_SHIF 442 i = (addr - range->start) >> PAGE_SHIFT; 157 npages = (end - addr) >> PAGE_SHIFT; 443 npages = (end - addr) >> PAGE_SHIFT; 158 hmm_pfns = &range->hmm_pfns[i]; !! 444 pfns = &range->pfns[i]; 159 required_fault = !! 445 hmm_range_need_fault(hmm_vma_walk, pfns, npages, 160 hmm_range_need_fault(hmm_vma_w !! 446 0, &fault, &write_fault); 161 if (!walk->vma) { !! 447 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 162 if (required_fault) << 163 return -EFAULT; << 164 return hmm_pfns_fill(addr, end << 165 } << 166 if (required_fault) << 167 return hmm_vma_fault(addr, end << 168 return hmm_pfns_fill(addr, end, range, << 169 } << 170 << 171 static inline unsigned long hmm_pfn_flags_orde << 172 { << 173 return order << HMM_PFN_ORDER_SHIFT; << 174 } 448 } 175 449 176 static inline unsigned long pmd_to_hmm_pfn_fla !! 450 static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd) 177 << 178 { 451 { 179 if (pmd_protnone(pmd)) 452 if (pmd_protnone(pmd)) 180 return 0; 453 return 0; 181 return (pmd_write(pmd) ? (HMM_PFN_VALI !! 454 return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] | 182 HMM_PFN_VALID !! 455 range->flags[HMM_PFN_WRITE] : 183 hmm_pfn_flags_order(PMD_SHIFT - !! 456 range->flags[HMM_PFN_VALID]; 184 } 457 } 185 458 186 #ifdef CONFIG_TRANSPARENT_HUGEPAGE !! 459 static int hmm_vma_handle_pmd(struct mm_walk *walk, 187 static int hmm_vma_handle_pmd(struct mm_walk * !! 460 unsigned long addr, 188 unsigned long en !! 
461 unsigned long end, >> 462 uint64_t *pfns, 189 pmd_t pmd) 463 pmd_t pmd) 190 { 464 { 191 struct hmm_vma_walk *hmm_vma_walk = wa 465 struct hmm_vma_walk *hmm_vma_walk = walk->private; 192 struct hmm_range *range = hmm_vma_walk 466 struct hmm_range *range = hmm_vma_walk->range; 193 unsigned long pfn, npages, i; 467 unsigned long pfn, npages, i; 194 unsigned int required_fault; !! 468 bool fault, write_fault; 195 unsigned long cpu_flags; !! 469 uint64_t cpu_flags; 196 470 197 npages = (end - addr) >> PAGE_SHIFT; 471 npages = (end - addr) >> PAGE_SHIFT; 198 cpu_flags = pmd_to_hmm_pfn_flags(range 472 cpu_flags = pmd_to_hmm_pfn_flags(range, pmd); 199 required_fault = !! 473 hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, 200 hmm_range_need_fault(hmm_vma_w !! 474 &fault, &write_fault); 201 if (required_fault) !! 475 202 return hmm_vma_fault(addr, end !! 476 if (pmd_protnone(pmd) || fault || write_fault) >> 477 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 203 478 204 pfn = pmd_pfn(pmd) + ((addr & ~PMD_MAS !! 479 pfn = pmd_pfn(pmd) + pte_index(addr); 205 for (i = 0; addr < end; addr += PAGE_S 480 for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++) 206 hmm_pfns[i] = pfn | cpu_flags; !! 481 pfns[i] = hmm_pfn_from_pfn(range, pfn) | cpu_flags; >> 482 hmm_vma_walk->last = end; 207 return 0; 483 return 0; 208 } 484 } 209 #else /* CONFIG_TRANSPARENT_HUGEPAGE */ << 210 /* stub to allow the code below to compile */ << 211 int hmm_vma_handle_pmd(struct mm_walk *walk, u << 212 unsigned long end, unsigned lo << 213 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ << 214 485 215 static inline unsigned long pte_to_hmm_pfn_fla !! 486 static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte) 216 << 217 { 487 { 218 if (pte_none(pte) || !pte_present(pte) !! 488 if (pte_none(pte) || !pte_present(pte)) 219 return 0; 489 return 0; 220 return pte_write(pte) ? (HMM_PFN_VALID !! 490 return pte_write(pte) ? range->flags[HMM_PFN_VALID] | >> 491 range->flags[HMM_PFN_WRITE] : >> 492 range->flags[HMM_PFN_VALID]; 221 } 493 } 222 494 223 static int hmm_vma_handle_pte(struct mm_walk * 495 static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, 224 unsigned long en 496 unsigned long end, pmd_t *pmdp, pte_t *ptep, 225 unsigned long *h !! 497 uint64_t *pfn) 226 { 498 { 227 struct hmm_vma_walk *hmm_vma_walk = wa 499 struct hmm_vma_walk *hmm_vma_walk = walk->private; 228 struct hmm_range *range = hmm_vma_walk 500 struct hmm_range *range = hmm_vma_walk->range; 229 unsigned int required_fault; !! 501 struct vm_area_struct *vma = walk->vma; 230 unsigned long cpu_flags; !! 502 bool fault, write_fault; 231 pte_t pte = ptep_get(ptep); !! 503 uint64_t cpu_flags; 232 uint64_t pfn_req_flags = *hmm_pfn; !! 504 pte_t pte = *ptep; 233 !! 505 uint64_t orig_pfn = *pfn; 234 if (pte_none_mostly(pte)) { !! 506 235 required_fault = !! 507 *pfn = range->values[HMM_PFN_NONE]; 236 hmm_pte_need_fault(hmm !! 508 cpu_flags = pte_to_hmm_pfn_flags(range, pte); 237 if (required_fault) !! 509 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, >> 510 &fault, &write_fault); >> 511 >> 512 if (pte_none(pte)) { >> 513 if (fault || write_fault) 238 goto fault; 514 goto fault; 239 *hmm_pfn = 0; << 240 return 0; 515 return 0; 241 } 516 } 242 517 243 if (!pte_present(pte)) { 518 if (!pte_present(pte)) { 244 swp_entry_t entry = pte_to_swp 519 swp_entry_t entry = pte_to_swp_entry(pte); 245 520 246 /* !! 521 if (!non_swap_entry(entry)) { 247 * Don't fault in device priva !! 
522 if (fault || write_fault) 248 * just report the PFN. !! 523 goto fault; 249 */ << 250 if (is_device_private_entry(en << 251 pfn_swap_entry_to_page(ent << 252 range->dev_private_owner) << 253 cpu_flags = HMM_PFN_VA << 254 if (is_writable_device << 255 cpu_flags |= H << 256 *hmm_pfn = swp_offset_ << 257 return 0; 524 return 0; 258 } 525 } 259 526 260 required_fault = !! 527 /* 261 hmm_pte_need_fault(hmm !! 528 * This is a special swap entry, ignore migration, use 262 if (!required_fault) { !! 529 * device and report anything else as error. 263 *hmm_pfn = 0; !! 530 */ >> 531 if (is_device_private_entry(entry)) { >> 532 cpu_flags = range->flags[HMM_PFN_VALID] | >> 533 range->flags[HMM_PFN_DEVICE_PRIVATE]; >> 534 cpu_flags |= is_write_device_private_entry(entry) ? >> 535 range->flags[HMM_PFN_WRITE] : 0; >> 536 hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags, >> 537 &fault, &write_fault); >> 538 if (fault || write_fault) >> 539 goto fault; >> 540 *pfn = hmm_pfn_from_pfn(range, swp_offset(entry)); >> 541 *pfn |= cpu_flags; 264 return 0; 542 return 0; 265 } 543 } 266 544 267 if (!non_swap_entry(entry)) << 268 goto fault; << 269 << 270 if (is_device_private_entry(en << 271 goto fault; << 272 << 273 if (is_device_exclusive_entry( << 274 goto fault; << 275 << 276 if (is_migration_entry(entry)) 545 if (is_migration_entry(entry)) { 277 pte_unmap(ptep); !! 546 if (fault || write_fault) { 278 hmm_vma_walk->last = a !! 547 pte_unmap(ptep); 279 migration_entry_wait(w !! 548 hmm_vma_walk->last = addr; 280 return -EBUSY; !! 549 migration_entry_wait(vma->vm_mm, >> 550 pmdp, addr); >> 551 return -EAGAIN; >> 552 } >> 553 return 0; 281 } 554 } 282 555 283 /* Report error for everything 556 /* Report error for everything else */ 284 pte_unmap(ptep); !! 557 *pfn = range->values[HMM_PFN_ERROR]; 285 return -EFAULT; 558 return -EFAULT; 286 } 559 } 287 560 288 cpu_flags = pte_to_hmm_pfn_flags(range !! 561 if (fault || write_fault) 289 required_fault = << 290 hmm_pte_need_fault(hmm_vma_wal << 291 if (required_fault) << 292 goto fault; 562 goto fault; 293 563 294 /* !! 564 *pfn = hmm_pfn_from_pfn(range, pte_pfn(pte)) | cpu_flags; 295 * Bypass devmap pte such as DAX page << 296 * flags(pfn_req_flags) are fulfilled. << 297 * Since each architecture defines a s << 298 * fall through and treat it like a no << 299 */ << 300 if (!vm_normal_page(walk->vma, addr, p << 301 !pte_devmap(pte) && << 302 !is_zero_pfn(pte_pfn(pte))) { << 303 if (hmm_pte_need_fault(hmm_vma << 304 pte_unmap(ptep); << 305 return -EFAULT; << 306 } << 307 *hmm_pfn = HMM_PFN_ERROR; << 308 return 0; << 309 } << 310 << 311 *hmm_pfn = pte_pfn(pte) | cpu_flags; << 312 return 0; 565 return 0; 313 566 314 fault: 567 fault: 315 pte_unmap(ptep); 568 pte_unmap(ptep); 316 /* Fault any virtual address we were a 569 /* Fault any virtual address we were asked to fault */ 317 return hmm_vma_fault(addr, end, requir !! 570 return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); 318 } 571 } 319 572 320 static int hmm_vma_walk_pmd(pmd_t *pmdp, 573 static int hmm_vma_walk_pmd(pmd_t *pmdp, 321 unsigned long star 574 unsigned long start, 322 unsigned long end, 575 unsigned long end, 323 struct mm_walk *wa 576 struct mm_walk *walk) 324 { 577 { 325 struct hmm_vma_walk *hmm_vma_walk = wa 578 struct hmm_vma_walk *hmm_vma_walk = walk->private; 326 struct hmm_range *range = hmm_vma_walk 579 struct hmm_range *range = hmm_vma_walk->range; 327 unsigned long *hmm_pfns = !! 580 uint64_t *pfns = range->pfns; 328 &range->hmm_pfns[(start - rang !! 
581 unsigned long addr = start, i; 329 unsigned long npages = (end - start) > << 330 unsigned long addr = start; << 331 pte_t *ptep; 582 pte_t *ptep; 332 pmd_t pmd; !! 583 >> 584 i = (addr - range->start) >> PAGE_SHIFT; 333 585 334 again: 586 again: 335 pmd = pmdp_get_lockless(pmdp); !! 587 if (pmd_none(*pmdp)) 336 if (pmd_none(pmd)) !! 588 return hmm_vma_walk_hole(start, end, walk); 337 return hmm_vma_walk_hole(start << 338 589 339 if (thp_migration_supported() && is_pm !! 590 if (pmd_huge(*pmdp) && (range->vma->vm_flags & VM_HUGETLB)) 340 if (hmm_range_need_fault(hmm_v !! 591 return hmm_pfns_bad(start, end, walk); 341 hmm_vma_walk->last = a << 342 pmd_migration_entry_wa << 343 return -EBUSY; << 344 } << 345 return hmm_pfns_fill(start, en << 346 } << 347 592 348 if (!pmd_present(pmd)) { !! 593 if (pmd_devmap(*pmdp) || pmd_trans_huge(*pmdp)) { 349 if (hmm_range_need_fault(hmm_v !! 594 pmd_t pmd; 350 return -EFAULT; << 351 return hmm_pfns_fill(start, en << 352 } << 353 595 354 if (pmd_devmap(pmd) || pmd_trans_huge( << 355 /* 596 /* 356 * No need to take pmd_lock he !! 597 * No need to take pmd_lock here, even if some other threads 357 * is splitting the huge pmd w 598 * is splitting the huge pmd we will get that event through 358 * mmu_notifier callback. 599 * mmu_notifier callback. 359 * 600 * 360 * So just read pmd value and !! 601 * So just read pmd value and check again its a transparent 361 * huge or device mapping one 602 * huge or device mapping one and compute corresponding pfn 362 * values. 603 * values. 363 */ 604 */ 364 pmd = pmdp_get_lockless(pmdp); !! 605 pmd = pmd_read_atomic(pmdp); >> 606 barrier(); 365 if (!pmd_devmap(pmd) && !pmd_t 607 if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) 366 goto again; 608 goto again; 367 609 368 return hmm_vma_handle_pmd(walk !! 610 return hmm_vma_handle_pmd(walk, addr, end, &pfns[i], pmd); 369 } 611 } 370 612 371 /* !! 613 if (pmd_bad(*pmdp)) 372 * We have handled all the valid cases !! 614 return hmm_pfns_bad(start, end, walk); 373 * huge or transparent huge. At this p << 374 * entry pointing to pte directory or << 375 * recover. << 376 */ << 377 if (pmd_bad(pmd)) { << 378 if (hmm_range_need_fault(hmm_v << 379 return -EFAULT; << 380 return hmm_pfns_fill(start, en << 381 } << 382 615 383 ptep = pte_offset_map(pmdp, addr); 616 ptep = pte_offset_map(pmdp, addr); 384 if (!ptep) !! 617 for (; addr < end; addr += PAGE_SIZE, ptep++, i++) { 385 goto again; << 386 for (; addr < end; addr += PAGE_SIZE, << 387 int r; 618 int r; 388 619 389 r = hmm_vma_handle_pte(walk, a !! 620 r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, &pfns[i]); 390 if (r) { 621 if (r) { 391 /* hmm_vma_handle_pte( !! 622 /* hmm_vma_handle_pte() did unmap pte directory */ >> 623 hmm_vma_walk->last = addr; 392 return r; 624 return r; 393 } 625 } 394 } 626 } 395 pte_unmap(ptep - 1); 627 pte_unmap(ptep - 1); >> 628 >> 629 hmm_vma_walk->last = addr; 396 return 0; 630 return 0; 397 } 631 } 398 632 399 #if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \ !! 633 static void hmm_pfns_clear(struct hmm_range *range, 400 defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEP !! 634 uint64_t *pfns, 401 static inline unsigned long pud_to_hmm_pfn_fla !! 635 unsigned long addr, 402 !! 636 unsigned long end) 403 { 637 { 404 if (!pud_present(pud)) !! 638 for (; addr < end; addr += PAGE_SIZE, pfns++) 405 return 0; !! 639 *pfns = range->values[HMM_PFN_NONE]; 406 return (pud_write(pud) ? 
(HMM_PFN_VALI << 407 HMM_PFN_VALID << 408 hmm_pfn_flags_order(PUD_SHIFT - << 409 } 640 } 410 641 411 static int hmm_vma_walk_pud(pud_t *pudp, unsig !! 642 static void hmm_pfns_special(struct hmm_range *range) 412 struct mm_walk *walk) << 413 { 643 { 414 struct hmm_vma_walk *hmm_vma_walk = wa !! 644 unsigned long addr = range->start, i = 0; 415 struct hmm_range *range = hmm_vma_walk << 416 unsigned long addr = start; << 417 pud_t pud; << 418 spinlock_t *ptl = pud_trans_huge_lock( << 419 << 420 if (!ptl) << 421 return 0; << 422 645 423 /* Normally we don't want to split the !! 646 for (; addr < range->end; addr += PAGE_SIZE, i++) 424 walk->action = ACTION_CONTINUE; !! 647 range->pfns[i] = range->values[HMM_PFN_SPECIAL]; >> 648 } 425 649 426 pud = READ_ONCE(*pudp); !! 650 /* 427 if (!pud_present(pud)) { !! 651 * hmm_vma_get_pfns() - snapshot CPU page table for a range of virtual addresses 428 spin_unlock(ptl); !! 652 * @range: range being snapshotted 429 return hmm_vma_walk_hole(start !! 653 * Returns: -EINVAL if invalid argument, -ENOMEM out of memory, -EPERM invalid 430 } !! 654 * vma permission, 0 success 431 !! 655 * 432 if (pud_leaf(pud) && pud_devmap(pud)) !! 656 * This snapshots the CPU page table for a range of virtual addresses. Snapshot 433 unsigned long i, npages, pfn; !! 657 * validity is tracked by range struct. See hmm_vma_range_done() for further 434 unsigned int required_fault; !! 658 * information. 435 unsigned long *hmm_pfns; !! 659 * 436 unsigned long cpu_flags; !! 660 * The range struct is initialized here. It tracks the CPU page table, but only 437 !! 661 * if the function returns success (0), in which case the caller must then call 438 i = (addr - range->start) >> P !! 662 * hmm_vma_range_done() to stop CPU page table update tracking on this range. 439 npages = (end - addr) >> PAGE_ !! 663 * 440 hmm_pfns = &range->hmm_pfns[i] !! 664 * NOT CALLING hmm_vma_range_done() IF FUNCTION RETURNS 0 WILL LEAD TO SERIOUS 441 !! 665 * MEMORY CORRUPTION ! YOU HAVE BEEN WARNED ! 442 cpu_flags = pud_to_hmm_pfn_fla !! 666 */ 443 required_fault = hmm_range_nee !! 667 int hmm_vma_get_pfns(struct hmm_range *range) 444 !! 668 { 445 if (required_fault) { !! 669 struct vm_area_struct *vma = range->vma; 446 spin_unlock(ptl); !! 670 struct hmm_vma_walk hmm_vma_walk; 447 return hmm_vma_fault(a !! 671 struct mm_walk mm_walk; 448 } !! 672 struct hmm *hmm; >> 673 >> 674 /* Sanity check, this really should not happen ! */ >> 675 if (range->start < vma->vm_start || range->start >= vma->vm_end) >> 676 return -EINVAL; >> 677 if (range->end < vma->vm_start || range->end > vma->vm_end) >> 678 return -EINVAL; >> 679 >> 680 hmm = hmm_register(vma->vm_mm); >> 681 if (!hmm) >> 682 return -ENOMEM; >> 683 /* Caller must have registered a mirror, via hmm_mirror_register() ! */ >> 684 if (!hmm->mmu_notifier.ops) >> 685 return -EINVAL; >> 686 >> 687 /* FIXME support hugetlb fs */ >> 688 if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { >> 689 hmm_pfns_special(range); >> 690 return -EINVAL; >> 691 } 449 692 450 pfn = pud_pfn(pud) + ((addr & !! 693 if (!(vma->vm_flags & VM_READ)) { 451 for (i = 0; i < npages; ++i, + !! 694 /* 452 hmm_pfns[i] = pfn | cp !! 695 * If vma do not allow read access, then assume that it does 453 goto out_unlock; !! 696 * not allow write access, either. Architecture that allow >> 697 * write without read access are not supported by HMM, because >> 698 * operations such has atomic access would not work. 
>> 699 */ >> 700 hmm_pfns_clear(range, range->pfns, range->start, range->end); >> 701 return -EPERM; 454 } 702 } 455 703 456 /* Ask for the PUD to be split */ !! 704 /* Initialize range to track CPU page table update */ 457 walk->action = ACTION_SUBTREE; !! 705 spin_lock(&hmm->lock); >> 706 range->valid = true; >> 707 list_add_rcu(&range->list, &hmm->ranges); >> 708 spin_unlock(&hmm->lock); >> 709 >> 710 hmm_vma_walk.fault = false; >> 711 hmm_vma_walk.range = range; >> 712 mm_walk.private = &hmm_vma_walk; >> 713 >> 714 mm_walk.vma = vma; >> 715 mm_walk.mm = vma->vm_mm; >> 716 mm_walk.pte_entry = NULL; >> 717 mm_walk.test_walk = NULL; >> 718 mm_walk.hugetlb_entry = NULL; >> 719 mm_walk.pmd_entry = hmm_vma_walk_pmd; >> 720 mm_walk.pte_hole = hmm_vma_walk_hole; 458 721 459 out_unlock: !! 722 walk_page_range(range->start, range->end, &mm_walk); 460 spin_unlock(ptl); << 461 return 0; 723 return 0; 462 } 724 } 463 #else !! 725 EXPORT_SYMBOL(hmm_vma_get_pfns); 464 #define hmm_vma_walk_pud NULL !! 726 465 #endif !! 727 /* 466 !! 728 * hmm_vma_range_done() - stop tracking change to CPU page table over a range 467 #ifdef CONFIG_HUGETLB_PAGE !! 729 * @range: range being tracked 468 static int hmm_vma_walk_hugetlb_entry(pte_t *p !! 730 * Returns: false if range data has been invalidated, true otherwise 469 unsigned !! 731 * 470 struct m !! 732 * Range struct is used to track updates to the CPU page table after a call to >> 733 * either hmm_vma_get_pfns() or hmm_vma_fault(). Once the device driver is done >> 734 * using the data, or wants to lock updates to the data it got from those >> 735 * functions, it must call the hmm_vma_range_done() function, which will then >> 736 * stop tracking CPU page table updates. >> 737 * >> 738 * Note that device driver must still implement general CPU page table update >> 739 * tracking either by using hmm_mirror (see hmm_mirror_register()) or by using >> 740 * the mmu_notifier API directly. >> 741 * >> 742 * CPU page table update tracking done through hmm_range is only temporary and >> 743 * to be used while trying to duplicate CPU page table contents for a range of >> 744 * virtual addresses. >> 745 * >> 746 * There are two ways to use this : >> 747 * again: >> 748 * hmm_vma_get_pfns(range); or hmm_vma_fault(...); >> 749 * trans = device_build_page_table_update_transaction(pfns); >> 750 * device_page_table_lock(); >> 751 * if (!hmm_vma_range_done(range)) { >> 752 * device_page_table_unlock(); >> 753 * goto again; >> 754 * } >> 755 * device_commit_transaction(trans); >> 756 * device_page_table_unlock(); >> 757 * >> 758 * Or: >> 759 * hmm_vma_get_pfns(range); or hmm_vma_fault(...); >> 760 * device_page_table_lock(); >> 761 * hmm_vma_range_done(range); >> 762 * device_update_page_table(range->pfns); >> 763 * device_page_table_unlock(); >> 764 */ >> 765 bool hmm_vma_range_done(struct hmm_range *range) 471 { 766 { 472 unsigned long addr = start, i, pfn; !! 767 unsigned long npages = (range->end - range->start) >> PAGE_SHIFT; 473 struct hmm_vma_walk *hmm_vma_walk = wa !! 768 struct hmm *hmm; 474 struct hmm_range *range = hmm_vma_walk !! 769 475 struct vm_area_struct *vma = walk->vma !! 770 if (range->end <= range->start) { 476 unsigned int required_fault; !! 771 BUG(); 477 unsigned long pfn_req_flags; !! 772 return false; 478 unsigned long cpu_flags; !! 773 } 479 spinlock_t *ptl; !! 774 480 pte_t entry; !! 775 hmm = hmm_register(range->vma->vm_mm); 481 !! 776 if (!hmm) { 482 ptl = huge_pte_lock(hstate_vma(vma), w !! 
777 memset(range->pfns, 0, sizeof(*range->pfns) * npages); 483 entry = huge_ptep_get(walk->mm, addr, !! 778 return false; 484 !! 779 } 485 i = (start - range->start) >> PAGE_SHI << 486 pfn_req_flags = range->hmm_pfns[i]; << 487 cpu_flags = pte_to_hmm_pfn_flags(range << 488 hmm_pfn_flags_order(huge_p << 489 required_fault = << 490 hmm_pte_need_fault(hmm_vma_wal << 491 if (required_fault) { << 492 int ret; << 493 780 494 spin_unlock(ptl); !! 781 spin_lock(&hmm->lock); 495 hugetlb_vma_unlock_read(vma); !! 782 list_del_rcu(&range->list); >> 783 spin_unlock(&hmm->lock); >> 784 >> 785 return range->valid; >> 786 } >> 787 EXPORT_SYMBOL(hmm_vma_range_done); >> 788 >> 789 /* >> 790 * hmm_vma_fault() - try to fault some address in a virtual address range >> 791 * @range: range being faulted >> 792 * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) >> 793 * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) >> 794 * >> 795 * This is similar to a regular CPU page fault except that it will not trigger >> 796 * any memory migration if the memory being faulted is not accessible by CPUs. >> 797 * >> 798 * On error, for one virtual address in the range, the function will mark the >> 799 * corresponding HMM pfn entry with an error flag. >> 800 * >> 801 * Expected use pattern: >> 802 * retry: >> 803 * down_read(&mm->mmap_sem); >> 804 * // Find vma and address device wants to fault, initialize hmm_pfn_t >> 805 * // array accordingly >> 806 * ret = hmm_vma_fault(range, write, block); >> 807 * switch (ret) { >> 808 * case -EAGAIN: >> 809 * hmm_vma_range_done(range); >> 810 * // You might want to rate limit or yield to play nicely, you may >> 811 * // also commit any valid pfn in the array assuming that you are >> 812 * // getting true from hmm_vma_range_monitor_end() >> 813 * goto retry; >> 814 * case 0: >> 815 * break; >> 816 * case -ENOMEM: >> 817 * case -EINVAL: >> 818 * case -EPERM: >> 819 * default: >> 820 * // Handle error ! >> 821 * up_read(&mm->mmap_sem) >> 822 * return; >> 823 * } >> 824 * // Take device driver lock that serialize device page table update >> 825 * driver_lock_device_page_table_update(); >> 826 * hmm_vma_range_done(range); >> 827 * // Commit pfns we got from hmm_vma_fault() >> 828 * driver_unlock_device_page_table_update(); >> 829 * up_read(&mm->mmap_sem) >> 830 * >> 831 * YOU MUST CALL hmm_vma_range_done() AFTER THIS FUNCTION RETURN SUCCESS (0) >> 832 * BEFORE FREEING THE range struct OR YOU WILL HAVE SERIOUS MEMORY CORRUPTION ! >> 833 * >> 834 * YOU HAVE BEEN WARNED ! >> 835 */ >> 836 int hmm_vma_fault(struct hmm_range *range, bool block) >> 837 { >> 838 struct vm_area_struct *vma = range->vma; >> 839 unsigned long start = range->start; >> 840 struct hmm_vma_walk hmm_vma_walk; >> 841 struct mm_walk mm_walk; >> 842 struct hmm *hmm; >> 843 int ret; >> 844 >> 845 /* Sanity check, this really should not happen ! 
*/ >> 846 if (range->start < vma->vm_start || range->start >= vma->vm_end) >> 847 return -EINVAL; >> 848 if (range->end < vma->vm_start || range->end > vma->vm_end) >> 849 return -EINVAL; >> 850 >> 851 hmm = hmm_register(vma->vm_mm); >> 852 if (!hmm) { >> 853 hmm_pfns_clear(range, range->pfns, range->start, range->end); >> 854 return -ENOMEM; >> 855 } >> 856 /* Caller must have registered a mirror using hmm_mirror_register() */ >> 857 if (!hmm->mmu_notifier.ops) >> 858 return -EINVAL; >> 859 >> 860 /* FIXME support hugetlb fs */ >> 861 if (is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_SPECIAL)) { >> 862 hmm_pfns_special(range); >> 863 return -EINVAL; >> 864 } >> 865 >> 866 if (!(vma->vm_flags & VM_READ)) { 496 /* 867 /* 497 * Avoid deadlock: drop the vm !! 868 * If vma do not allow read access, then assume that it does 498 * hmm_vma_fault(), which will !! 869 * not allow write access, either. Architecture that allow 499 * drop the vma lock. This is !! 870 * write without read access are not supported by HMM, because 500 * protection point of view, b !! 871 * operations such has atomic access would not work. 501 * use here of either pte or p << 502 * lock. << 503 */ 872 */ 504 ret = hmm_vma_fault(addr, end, !! 873 hmm_pfns_clear(range, range->pfns, range->start, range->end); 505 hugetlb_vma_lock_read(vma); !! 874 return -EPERM; 506 return ret; << 507 } 875 } 508 876 509 pfn = pte_pfn(entry) + ((start & ~hmas !! 877 /* Initialize range to track CPU page table update */ 510 for (; addr < end; addr += PAGE_SIZE, !! 878 spin_lock(&hmm->lock); 511 range->hmm_pfns[i] = pfn | cpu !! 879 range->valid = true; >> 880 list_add_rcu(&range->list, &hmm->ranges); >> 881 spin_unlock(&hmm->lock); >> 882 >> 883 hmm_vma_walk.fault = true; >> 884 hmm_vma_walk.block = block; >> 885 hmm_vma_walk.range = range; >> 886 mm_walk.private = &hmm_vma_walk; >> 887 hmm_vma_walk.last = range->start; >> 888 >> 889 mm_walk.vma = vma; >> 890 mm_walk.mm = vma->vm_mm; >> 891 mm_walk.pte_entry = NULL; >> 892 mm_walk.test_walk = NULL; >> 893 mm_walk.hugetlb_entry = NULL; >> 894 mm_walk.pmd_entry = hmm_vma_walk_pmd; >> 895 mm_walk.pte_hole = hmm_vma_walk_hole; 512 896 513 spin_unlock(ptl); !! 897 do { 514 return 0; !! 898 ret = walk_page_range(start, range->end, &mm_walk); >> 899 start = hmm_vma_walk.last; >> 900 } while (ret == -EAGAIN); >> 901 >> 902 if (ret) { >> 903 unsigned long i; >> 904 >> 905 i = (hmm_vma_walk.last - range->start) >> PAGE_SHIFT; >> 906 hmm_pfns_clear(range, &range->pfns[i], hmm_vma_walk.last, >> 907 range->end); >> 908 hmm_vma_range_done(range); >> 909 } >> 910 return ret; 515 } 911 } 516 #else !! 912 EXPORT_SYMBOL(hmm_vma_fault); 517 #define hmm_vma_walk_hugetlb_entry NULL !! 913 #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ 518 #endif /* CONFIG_HUGETLB_PAGE */ << 519 914 520 static int hmm_vma_walk_test(unsigned long sta !! 915 521 struct mm_walk *w !! 916 #if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) >> 917 struct page *hmm_vma_alloc_locked_page(struct vm_area_struct *vma, >> 918 unsigned long addr) 522 { 919 { 523 struct hmm_vma_walk *hmm_vma_walk = wa !! 920 struct page *page; 524 struct hmm_range *range = hmm_vma_walk << 525 struct vm_area_struct *vma = walk->vma << 526 921 527 if (!(vma->vm_flags & (VM_IO | VM_PFNM !! 922 page = alloc_page_vma(GFP_HIGHUSER, vma, addr); 528 vma->vm_flags & VM_READ) !! 923 if (!page) 529 return 0; !! 
924 return NULL; >> 925 lock_page(page); >> 926 return page; >> 927 } >> 928 EXPORT_SYMBOL(hmm_vma_alloc_locked_page); >> 929 >> 930 >> 931 static void hmm_devmem_ref_release(struct percpu_ref *ref) >> 932 { >> 933 struct hmm_devmem *devmem; >> 934 >> 935 devmem = container_of(ref, struct hmm_devmem, ref); >> 936 complete(&devmem->completion); >> 937 } >> 938 >> 939 static void hmm_devmem_ref_exit(void *data) >> 940 { >> 941 struct percpu_ref *ref = data; >> 942 struct hmm_devmem *devmem; >> 943 >> 944 devmem = container_of(ref, struct hmm_devmem, ref); >> 945 percpu_ref_exit(ref); >> 946 devm_remove_action(devmem->device, &hmm_devmem_ref_exit, data); >> 947 } >> 948 >> 949 static void hmm_devmem_ref_kill(void *data) >> 950 { >> 951 struct percpu_ref *ref = data; >> 952 struct hmm_devmem *devmem; >> 953 >> 954 devmem = container_of(ref, struct hmm_devmem, ref); >> 955 percpu_ref_kill(ref); >> 956 wait_for_completion(&devmem->completion); >> 957 devm_remove_action(devmem->device, &hmm_devmem_ref_kill, data); >> 958 } >> 959 >> 960 static int hmm_devmem_fault(struct vm_area_struct *vma, >> 961 unsigned long addr, >> 962 const struct page *page, >> 963 unsigned int flags, >> 964 pmd_t *pmdp) >> 965 { >> 966 struct hmm_devmem *devmem = page->pgmap->data; >> 967 >> 968 return devmem->ops->fault(devmem, vma, addr, page, flags, pmdp); >> 969 } >> 970 >> 971 static void hmm_devmem_free(struct page *page, void *data) >> 972 { >> 973 struct hmm_devmem *devmem = data; >> 974 >> 975 devmem->ops->free(devmem, page); >> 976 } >> 977 >> 978 static DEFINE_MUTEX(hmm_devmem_lock); >> 979 static RADIX_TREE(hmm_devmem_radix, GFP_KERNEL); >> 980 >> 981 static void hmm_devmem_radix_release(struct resource *resource) >> 982 { >> 983 resource_size_t key, align_start, align_size; >> 984 >> 985 align_start = resource->start & ~(PA_SECTION_SIZE - 1); >> 986 align_size = ALIGN(resource_size(resource), PA_SECTION_SIZE); >> 987 >> 988 mutex_lock(&hmm_devmem_lock); >> 989 for (key = resource->start; >> 990 key <= resource->end; >> 991 key += PA_SECTION_SIZE) >> 992 radix_tree_delete(&hmm_devmem_radix, key >> PA_SECTION_SHIFT); >> 993 mutex_unlock(&hmm_devmem_lock); >> 994 } >> 995 >> 996 static void hmm_devmem_release(struct device *dev, void *data) >> 997 { >> 998 struct hmm_devmem *devmem = data; >> 999 struct resource *resource = devmem->resource; >> 1000 unsigned long start_pfn, npages; >> 1001 struct zone *zone; >> 1002 struct page *page; >> 1003 >> 1004 if (percpu_ref_tryget_live(&devmem->ref)) { >> 1005 dev_WARN(dev, "%s: page mapping is still live!\n", __func__); >> 1006 percpu_ref_put(&devmem->ref); >> 1007 } >> 1008 >> 1009 /* pages are dead and unused, undo the arch mapping */ >> 1010 start_pfn = (resource->start & ~(PA_SECTION_SIZE - 1)) >> PAGE_SHIFT; >> 1011 npages = ALIGN(resource_size(resource), PA_SECTION_SIZE) >> PAGE_SHIFT; >> 1012 >> 1013 page = pfn_to_page(start_pfn); >> 1014 zone = page_zone(page); >> 1015 >> 1016 mem_hotplug_begin(); >> 1017 if (resource->desc == IORES_DESC_DEVICE_PRIVATE_MEMORY) >> 1018 __remove_pages(zone, start_pfn, npages, NULL); >> 1019 else >> 1020 arch_remove_memory(start_pfn << PAGE_SHIFT, >> 1021 npages << PAGE_SHIFT, NULL); >> 1022 mem_hotplug_done(); >> 1023 >> 1024 hmm_devmem_radix_release(resource); >> 1025 } >> 1026 >> 1027 static int hmm_devmem_pages_create(struct hmm_devmem *devmem) >> 1028 { >> 1029 resource_size_t key, align_start, align_size, align_end; >> 1030 struct device *device = devmem->device; >> 1031 int ret, nid, is_ram; >> 1032 unsigned long pfn; >> 
1033 >> 1034 align_start = devmem->resource->start & ~(PA_SECTION_SIZE - 1); >> 1035 align_size = ALIGN(devmem->resource->start + >> 1036 resource_size(devmem->resource), >> 1037 PA_SECTION_SIZE) - align_start; >> 1038 >> 1039 is_ram = region_intersects(align_start, align_size, >> 1040 IORESOURCE_SYSTEM_RAM, >> 1041 IORES_DESC_NONE); >> 1042 if (is_ram == REGION_MIXED) { >> 1043 WARN_ONCE(1, "%s attempted on mixed region %pr\n", >> 1044 __func__, devmem->resource); >> 1045 return -ENXIO; >> 1046 } >> 1047 if (is_ram == REGION_INTERSECTS) >> 1048 return -ENXIO; 530 1049 >> 1050 if (devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY) >> 1051 devmem->pagemap.type = MEMORY_DEVICE_PUBLIC; >> 1052 else >> 1053 devmem->pagemap.type = MEMORY_DEVICE_PRIVATE; >> 1054 >> 1055 devmem->pagemap.res = *devmem->resource; >> 1056 devmem->pagemap.page_fault = hmm_devmem_fault; >> 1057 devmem->pagemap.page_free = hmm_devmem_free; >> 1058 devmem->pagemap.dev = devmem->device; >> 1059 devmem->pagemap.ref = &devmem->ref; >> 1060 devmem->pagemap.data = devmem; >> 1061 >> 1062 mutex_lock(&hmm_devmem_lock); >> 1063 align_end = align_start + align_size - 1; >> 1064 for (key = align_start; key <= align_end; key += PA_SECTION_SIZE) { >> 1065 struct hmm_devmem *dup; >> 1066 >> 1067 dup = radix_tree_lookup(&hmm_devmem_radix, >> 1068 key >> PA_SECTION_SHIFT); >> 1069 if (dup) { >> 1070 dev_err(device, "%s: collides with mapping for %s\n", >> 1071 __func__, dev_name(dup->device)); >> 1072 mutex_unlock(&hmm_devmem_lock); >> 1073 ret = -EBUSY; >> 1074 goto error; >> 1075 } >> 1076 ret = radix_tree_insert(&hmm_devmem_radix, >> 1077 key >> PA_SECTION_SHIFT, >> 1078 devmem); >> 1079 if (ret) { >> 1080 dev_err(device, "%s: failed: %d\n", __func__, ret); >> 1081 mutex_unlock(&hmm_devmem_lock); >> 1082 goto error_radix; >> 1083 } >> 1084 } >> 1085 mutex_unlock(&hmm_devmem_lock); >> 1086 >> 1087 nid = dev_to_node(device); >> 1088 if (nid < 0) >> 1089 nid = numa_mem_id(); >> 1090 >> 1091 mem_hotplug_begin(); 531 /* 1092 /* 532 * vma ranges that don't have struct p !! 1093 * For device private memory we call add_pages() as we only need to 533 * devices directly cannot be handled !! 1094 * allocate and initialize struct page for the device memory. More- >> 1095 * over the device memory is un-accessible thus we do not want to >> 1096 * create a linear mapping for the memory like arch_add_memory() >> 1097 * would do. 534 * 1098 * 535 * If the vma does not allow read acce !! 1099 * For device public memory, which is accesible by the CPU, we do 536 * allow write access either. HMM does !! 1100 * want the linear mapping and thus use arch_add_memory(). 537 * allow write without read. !! 
1101 */ >> 1102 if (devmem->pagemap.type == MEMORY_DEVICE_PUBLIC) >> 1103 ret = arch_add_memory(nid, align_start, align_size, NULL, >> 1104 false); >> 1105 else >> 1106 ret = add_pages(nid, align_start >> PAGE_SHIFT, >> 1107 align_size >> PAGE_SHIFT, NULL, false); >> 1108 if (ret) { >> 1109 mem_hotplug_done(); >> 1110 goto error_add_memory; >> 1111 } >> 1112 move_pfn_range_to_zone(&NODE_DATA(nid)->node_zones[ZONE_DEVICE], >> 1113 align_start >> PAGE_SHIFT, >> 1114 align_size >> PAGE_SHIFT, NULL); >> 1115 mem_hotplug_done(); >> 1116 >> 1117 for (pfn = devmem->pfn_first; pfn < devmem->pfn_last; pfn++) { >> 1118 struct page *page = pfn_to_page(pfn); >> 1119 >> 1120 page->pgmap = &devmem->pagemap; >> 1121 } >> 1122 return 0; >> 1123 >> 1124 error_add_memory: >> 1125 untrack_pfn(NULL, PHYS_PFN(align_start), align_size); >> 1126 error_radix: >> 1127 hmm_devmem_radix_release(devmem->resource); >> 1128 error: >> 1129 return ret; >> 1130 } >> 1131 >> 1132 static int hmm_devmem_match(struct device *dev, void *data, void *match_data) >> 1133 { >> 1134 struct hmm_devmem *devmem = data; >> 1135 >> 1136 return devmem->resource == match_data; >> 1137 } >> 1138 >> 1139 static void hmm_devmem_pages_remove(struct hmm_devmem *devmem) >> 1140 { >> 1141 devres_release(devmem->device, &hmm_devmem_release, >> 1142 &hmm_devmem_match, devmem->resource); >> 1143 } >> 1144 >> 1145 /* >> 1146 * hmm_devmem_add() - hotplug ZONE_DEVICE memory for device memory >> 1147 * >> 1148 * @ops: memory event device driver callback (see struct hmm_devmem_ops) >> 1149 * @device: device struct to bind the resource too >> 1150 * @size: size in bytes of the device memory to add >> 1151 * Returns: pointer to new hmm_devmem struct ERR_PTR otherwise >> 1152 * >> 1153 * This function first finds an empty range of physical address big enough to >> 1154 * contain the new resource, and then hotplugs it as ZONE_DEVICE memory, which >> 1155 * in turn allocates struct pages. It does not do anything beyond that; all >> 1156 * events affecting the memory will go through the various callbacks provided >> 1157 * by hmm_devmem_ops struct. >> 1158 * >> 1159 * Device driver should call this function during device initialization and >> 1160 * is then responsible of memory management. HMM only provides helpers. 
>> 1161 */ >> 1162 struct hmm_devmem *hmm_devmem_add(const struct hmm_devmem_ops *ops, >> 1163 struct device *device, >> 1164 unsigned long size) >> 1165 { >> 1166 struct hmm_devmem *devmem; >> 1167 resource_size_t addr; >> 1168 int ret; >> 1169 >> 1170 static_branch_enable(&device_private_key); >> 1171 >> 1172 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), >> 1173 GFP_KERNEL, dev_to_node(device)); >> 1174 if (!devmem) >> 1175 return ERR_PTR(-ENOMEM); >> 1176 >> 1177 init_completion(&devmem->completion); >> 1178 devmem->pfn_first = -1UL; >> 1179 devmem->pfn_last = -1UL; >> 1180 devmem->resource = NULL; >> 1181 devmem->device = device; >> 1182 devmem->ops = ops; >> 1183 >> 1184 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, >> 1185 0, GFP_KERNEL); >> 1186 if (ret) >> 1187 goto error_percpu_ref; >> 1188 >> 1189 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); >> 1190 if (ret) >> 1191 goto error_devm_add_action; >> 1192 >> 1193 size = ALIGN(size, PA_SECTION_SIZE); >> 1194 addr = min((unsigned long)iomem_resource.end, >> 1195 (1UL << MAX_PHYSMEM_BITS) - 1); >> 1196 addr = addr - size + 1UL; >> 1197 >> 1198 /* >> 1199 * FIXME add a new helper to quickly walk resource tree and find free >> 1200 * range 538 * 1201 * 539 * If a fault is requested for an unsu !! 1202 * FIXME what about ioport_resource resource ? 540 * failure. << 541 */ 1203 */ 542 if (hmm_range_need_fault(hmm_vma_walk, !! 1204 for (; addr > size && addr >= iomem_resource.start; addr -= size) { 543 range->hmm_pf !! 1205 ret = region_intersects(addr, size, 0, IORES_DESC_NONE); 544 ((sta !! 1206 if (ret != REGION_DISJOINT) 545 (end - start) !! 1207 continue; 546 return -EFAULT; !! 1208 >> 1209 devmem->resource = devm_request_mem_region(device, addr, size, >> 1210 dev_name(device)); >> 1211 if (!devmem->resource) { >> 1212 ret = -ENOMEM; >> 1213 goto error_no_resource; >> 1214 } >> 1215 break; >> 1216 } >> 1217 if (!devmem->resource) { >> 1218 ret = -ERANGE; >> 1219 goto error_no_resource; >> 1220 } >> 1221 >> 1222 devmem->resource->desc = IORES_DESC_DEVICE_PRIVATE_MEMORY; >> 1223 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; >> 1224 devmem->pfn_last = devmem->pfn_first + >> 1225 (resource_size(devmem->resource) >> PAGE_SHIFT); >> 1226 >> 1227 ret = hmm_devmem_pages_create(devmem); >> 1228 if (ret) >> 1229 goto error_pages; >> 1230 >> 1231 devres_add(device, devmem); >> 1232 >> 1233 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); >> 1234 if (ret) { >> 1235 hmm_devmem_remove(devmem); >> 1236 return ERR_PTR(ret); >> 1237 } 547 1238 548 hmm_pfns_fill(start, end, range, HMM_P !! 1239 return devmem; 549 1240 550 /* Skip this vma and continue processi !! 1241 error_pages: 551 return 1; !! 1242 devm_release_mem_region(device, devmem->resource->start, >> 1243 resource_size(devmem->resource)); >> 1244 error_no_resource: >> 1245 error_devm_add_action: >> 1246 hmm_devmem_ref_kill(&devmem->ref); >> 1247 hmm_devmem_ref_exit(&devmem->ref); >> 1248 error_percpu_ref: >> 1249 devres_free(devmem); >> 1250 return ERR_PTR(ret); 552 } 1251 } >> 1252 EXPORT_SYMBOL(hmm_devmem_add); 553 1253 554 static const struct mm_walk_ops hmm_walk_ops = !! 1254 struct hmm_devmem *hmm_devmem_add_resource(const struct hmm_devmem_ops *ops, 555 .pud_entry = hmm_vma_walk_pud, !! 1255 struct device *device, 556 .pmd_entry = hmm_vma_walk_pmd, !! 1256 struct resource *res) 557 .pte_hole = hmm_vma_walk_hole, !! 1257 { 558 .hugetlb_entry = hmm_vma_walk_hugetlb !! 
1258 struct hmm_devmem *devmem; 559 .test_walk = hmm_vma_walk_test, !! 1259 int ret; 560 .walk_lock = PGWALK_RDLOCK, !! 1260 561 }; !! 1261 if (res->desc != IORES_DESC_DEVICE_PUBLIC_MEMORY) >> 1262 return ERR_PTR(-EINVAL); 562 1263 563 /** !! 1264 static_branch_enable(&device_private_key); 564 * hmm_range_fault - try to fault some address !! 1265 565 * @range: argument structure !! 1266 devmem = devres_alloc_node(&hmm_devmem_release, sizeof(*devmem), 566 * !! 1267 GFP_KERNEL, dev_to_node(device)); 567 * Returns 0 on success or one of the followin !! 1268 if (!devmem) 568 * !! 1269 return ERR_PTR(-ENOMEM); 569 * -EINVAL: Invalid arguments or mm or vir !! 1270 570 * (e.g., device file vma). !! 1271 init_completion(&devmem->completion); 571 * -ENOMEM: Out of memory. !! 1272 devmem->pfn_first = -1UL; 572 * -EPERM: Invalid permission (e.g., aski !! 1273 devmem->pfn_last = -1UL; 573 * only). !! 1274 devmem->resource = res; 574 * -EBUSY: The range has been invalidated !! 1275 devmem->device = device; 575 * the invalidation to finish. !! 1276 devmem->ops = ops; 576 * -EFAULT: A page was requested to be val !! 1277 577 * ie it has no backing VMA or it !! 1278 ret = percpu_ref_init(&devmem->ref, &hmm_devmem_ref_release, 578 * !! 1279 0, GFP_KERNEL); 579 * This is similar to get_user_pages(), except !! 1280 if (ret) 580 * without mutating them (ie causing faults). !! 1281 goto error_percpu_ref; 581 */ !! 1282 582 int hmm_range_fault(struct hmm_range *range) !! 1283 ret = devm_add_action(device, hmm_devmem_ref_exit, &devmem->ref); 583 { !! 1284 if (ret) 584 struct hmm_vma_walk hmm_vma_walk = { !! 1285 goto error_devm_add_action; 585 .range = range, !! 1286 586 .last = range->start, !! 1287 587 }; !! 1288 devmem->pfn_first = devmem->resource->start >> PAGE_SHIFT; 588 struct mm_struct *mm = range->notifier !! 1289 devmem->pfn_last = devmem->pfn_first + >> 1290 (resource_size(devmem->resource) >> PAGE_SHIFT); >> 1291 >> 1292 ret = hmm_devmem_pages_create(devmem); >> 1293 if (ret) >> 1294 goto error_devm_add_action; >> 1295 >> 1296 devres_add(device, devmem); >> 1297 >> 1298 ret = devm_add_action(device, hmm_devmem_ref_kill, &devmem->ref); >> 1299 if (ret) { >> 1300 hmm_devmem_remove(devmem); >> 1301 return ERR_PTR(ret); >> 1302 } >> 1303 >> 1304 return devmem; >> 1305 >> 1306 error_devm_add_action: >> 1307 hmm_devmem_ref_kill(&devmem->ref); >> 1308 hmm_devmem_ref_exit(&devmem->ref); >> 1309 error_percpu_ref: >> 1310 devres_free(devmem); >> 1311 return ERR_PTR(ret); >> 1312 } >> 1313 EXPORT_SYMBOL(hmm_devmem_add_resource); >> 1314 >> 1315 /* >> 1316 * hmm_devmem_remove() - remove device memory (kill and free ZONE_DEVICE) >> 1317 * >> 1318 * @devmem: hmm_devmem struct use to track and manage the ZONE_DEVICE memory >> 1319 * >> 1320 * This will hot-unplug memory that was hotplugged by hmm_devmem_add on behalf >> 1321 * of the device driver. It will free struct page and remove the resource that >> 1322 * reserved the physical address range for this device memory. 
>> 1323 */ >> 1324 void hmm_devmem_remove(struct hmm_devmem *devmem) >> 1325 { >> 1326 resource_size_t start, size; >> 1327 struct device *device; >> 1328 bool cdm = false; >> 1329 >> 1330 if (!devmem) >> 1331 return; >> 1332 >> 1333 device = devmem->device; >> 1334 start = devmem->resource->start; >> 1335 size = resource_size(devmem->resource); >> 1336 >> 1337 cdm = devmem->resource->desc == IORES_DESC_DEVICE_PUBLIC_MEMORY; >> 1338 hmm_devmem_ref_kill(&devmem->ref); >> 1339 hmm_devmem_ref_exit(&devmem->ref); >> 1340 hmm_devmem_pages_remove(devmem); >> 1341 >> 1342 if (!cdm) >> 1343 devm_release_mem_region(device, start, size); >> 1344 } >> 1345 EXPORT_SYMBOL(hmm_devmem_remove); >> 1346 >> 1347 /* >> 1348 * A device driver that wants to handle multiple devices memory through a >> 1349 * single fake device can use hmm_device to do so. This is purely a helper >> 1350 * and it is not needed to make use of any HMM functionality. >> 1351 */ >> 1352 #define HMM_DEVICE_MAX 256 >> 1353 >> 1354 static DECLARE_BITMAP(hmm_device_mask, HMM_DEVICE_MAX); >> 1355 static DEFINE_SPINLOCK(hmm_device_lock); >> 1356 static struct class *hmm_device_class; >> 1357 static dev_t hmm_device_devt; >> 1358 >> 1359 static void hmm_device_release(struct device *device) >> 1360 { >> 1361 struct hmm_device *hmm_device; >> 1362 >> 1363 hmm_device = container_of(device, struct hmm_device, device); >> 1364 spin_lock(&hmm_device_lock); >> 1365 clear_bit(hmm_device->minor, hmm_device_mask); >> 1366 spin_unlock(&hmm_device_lock); >> 1367 >> 1368 kfree(hmm_device); >> 1369 } >> 1370 >> 1371 struct hmm_device *hmm_device_new(void *drvdata) >> 1372 { >> 1373 struct hmm_device *hmm_device; >> 1374 >> 1375 hmm_device = kzalloc(sizeof(*hmm_device), GFP_KERNEL); >> 1376 if (!hmm_device) >> 1377 return ERR_PTR(-ENOMEM); >> 1378 >> 1379 spin_lock(&hmm_device_lock); >> 1380 hmm_device->minor = find_first_zero_bit(hmm_device_mask, HMM_DEVICE_MAX); >> 1381 if (hmm_device->minor >= HMM_DEVICE_MAX) { >> 1382 spin_unlock(&hmm_device_lock); >> 1383 kfree(hmm_device); >> 1384 return ERR_PTR(-EBUSY); >> 1385 } >> 1386 set_bit(hmm_device->minor, hmm_device_mask); >> 1387 spin_unlock(&hmm_device_lock); >> 1388 >> 1389 dev_set_name(&hmm_device->device, "hmm_device%d", hmm_device->minor); >> 1390 hmm_device->device.devt = MKDEV(MAJOR(hmm_device_devt), >> 1391 hmm_device->minor); >> 1392 hmm_device->device.release = hmm_device_release; >> 1393 dev_set_drvdata(&hmm_device->device, drvdata); >> 1394 hmm_device->device.class = hmm_device_class; >> 1395 device_initialize(&hmm_device->device); >> 1396 >> 1397 return hmm_device; >> 1398 } >> 1399 EXPORT_SYMBOL(hmm_device_new); >> 1400 >> 1401 void hmm_device_put(struct hmm_device *hmm_device) >> 1402 { >> 1403 put_device(&hmm_device->device); >> 1404 } >> 1405 EXPORT_SYMBOL(hmm_device_put); >> 1406 >> 1407 static int __init hmm_init(void) >> 1408 { 589 int ret; 1409 int ret; 590 1410 591 mmap_assert_locked(mm); !! 1411 ret = alloc_chrdev_region(&hmm_device_devt, 0, >> 1412 HMM_DEVICE_MAX, >> 1413 "hmm_device"); >> 1414 if (ret) >> 1415 return ret; 592 1416 593 do { !! 1417 hmm_device_class = class_create(THIS_MODULE, "hmm_device"); 594 /* If range is no longer valid !! 1418 if (IS_ERR(hmm_device_class)) { 595 if (mmu_interval_check_retry(r !! 1419 unregister_chrdev_region(hmm_device_devt, HMM_DEVICE_MAX); 596 r !! 1420 return PTR_ERR(hmm_device_class); 597 return -EBUSY; !! 1421 } 598 ret = walk_page_range(mm, hmm_ !! 
1422 return 0; 599 &hmm_wal << 600 /* << 601 * When -EBUSY is returned the << 602 * hmm_vma_walk.last set to an << 603 * in pfns. All entries < last << 604 * output, and all >= are stil << 605 */ << 606 } while (ret == -EBUSY); << 607 return ret; << 608 } 1423 } 609 EXPORT_SYMBOL(hmm_range_fault); !! 1424 >> 1425 device_initcall(hmm_init); >> 1426 #endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ 610 1427
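
The newer version of this file shown above drops the old hmm_vma_fault()/hmm_vma_range_done() pair in favour of hmm_range_fault(), whose comment says the caller should retry on -EBUSY and revalidate the snapshot against its mmu_interval_notifier. As a rough illustration of that calling convention (following the pattern documented in Documentation/mm/hmm.rst), a driver-side loop might look like the sketch below; the driver_* helpers and the device page-table lock are illustrative assumptions, not part of this file.

	/*
	 * Illustrative only: one way a driver might drive hmm_range_fault().
	 * The driver_*_device_pagetable() helpers are hypothetical.
	 */
	#include <linux/hmm.h>
	#include <linux/mm.h>
	#include <linux/mmu_notifier.h>

	static int driver_populate_range(struct mmu_interval_notifier *notifier,
					 struct mm_struct *mm,
					 unsigned long start, unsigned long end,
					 unsigned long *pfns)
	{
		struct hmm_range range = {
			.notifier = notifier,
			.start = start,
			.end = end,
			.hmm_pfns = pfns,
			/* fault everything in and ask for write access */
			.default_flags = HMM_PFN_REQ_FAULT | HMM_PFN_REQ_WRITE,
		};
		int ret;

	again:
		range.notifier_seq = mmu_interval_read_begin(notifier);
		mmap_read_lock(mm);
		ret = hmm_range_fault(&range);
		mmap_read_unlock(mm);
		if (ret) {
			if (ret == -EBUSY)	/* raced with an invalidation */
				goto again;
			return ret;
		}

		driver_lock_device_pagetable();		/* hypothetical driver lock */
		if (mmu_interval_read_retry(notifier, range.notifier_seq)) {
			driver_unlock_device_pagetable();
			goto again;
		}
		/* pfns[] is stable while the driver lock is held: program the device */
		driver_program_device_pagetable(pfns, start, end);
		driver_unlock_device_pagetable();
		return 0;
	}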