TOMOYO Linux Cross Reference
Linux/mm/hugetlb_vmemmap.c

  1 // SPDX-License-Identifier: GPL-2.0
  2 /*
  3  * HugeTLB Vmemmap Optimization (HVO)
  4  *
  5  * Copyright (c) 2020, ByteDance. All rights reserved.
  6  *
  7  *     Author: Muchun Song <songmuchun@bytedance.com>
  8  *
  9  * See Documentation/mm/vmemmap_dedup.rst
 10  */
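/*
 * A rough, illustrative sketch of the savings HVO targets (added for clarity;
 * it assumes 4 KiB base pages and a 64-byte struct page, which are typical
 * but not guaranteed):
 *
 *   2 MiB hugepage: 512 struct pages * 64 B = 32 KiB = 8 vmemmap pages.
 *                   HVO keeps the head vmemmap page and remaps the other 7
 *                   tail vmemmap pages (read-only) to it, freeing 7 pages
 *                   (28 KiB) per hugepage.
 *   1 GiB hugepage: 262144 struct pages * 64 B = 16 MiB = 4096 vmemmap
 *                   pages, of which 4095 (~16 MiB) can be freed.
 *
 * See Documentation/mm/vmemmap_dedup.rst for the authoritative description.
 */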
 11 #define pr_fmt(fmt)     "HugeTLB: " fmt
 12 
 13 #include <linux/pgtable.h>
 14 #include <linux/moduleparam.h>
 15 #include <linux/bootmem_info.h>
 16 #include <linux/mmdebug.h>
 17 #include <linux/pagewalk.h>
 18 #include <asm/pgalloc.h>
 19 #include <asm/tlbflush.h>
 20 #include "hugetlb_vmemmap.h"
 21 
 22 /**
 23  * struct vmemmap_remap_walk - walk vmemmap page table
 24  *
 25  * @remap_pte:          called for each lowest-level entry (PTE).
 26  * @nr_walked:          the number of PTEs walked.
 27  * @reuse_page:         the page which is reused for the tail vmemmap pages.
 28  * @reuse_addr:         the virtual address of the @reuse_page page.
 29  * @vmemmap_pages:      the list head of the vmemmap pages that can be freed
 30  *                      (when remapping) or that the range is remapped to (when restoring).
 31  * @flags:              used to modify behavior in vmemmap page table walking
 32  *                      operations.
 33  */
 34 struct vmemmap_remap_walk {
 35         void                    (*remap_pte)(pte_t *pte, unsigned long addr,
 36                                              struct vmemmap_remap_walk *walk);
 37         unsigned long           nr_walked;
 38         struct page             *reuse_page;
 39         unsigned long           reuse_addr;
 40         struct list_head        *vmemmap_pages;
 41 
 42 /* Skip the TLB flush when we split the PMD */
 43 #define VMEMMAP_SPLIT_NO_TLB_FLUSH      BIT(0)
 44 /* Skip the TLB flush when we remap the PTE */
 45 #define VMEMMAP_REMAP_NO_TLB_FLUSH      BIT(1)
 46         unsigned long           flags;
 47 };
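/*
 * Note on the two NO_TLB_FLUSH flags: the bulk path in
 * hugetlb_vmemmap_optimize_folios() below splits and remaps an entire list
 * of folios with the per-folio TLB flushes suppressed, and then issues a
 * single flush_tlb_all() for the whole batch.
 */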
 48 
 49 static int vmemmap_split_pmd(pmd_t *pmd, struct page *head, unsigned long start,
 50                              struct vmemmap_remap_walk *walk)
 51 {
 52         pmd_t __pmd;
 53         int i;
 54         unsigned long addr = start;
 55         pte_t *pgtable;
 56 
 57         pgtable = pte_alloc_one_kernel(&init_mm);
 58         if (!pgtable)
 59                 return -ENOMEM;
 60 
 61         pmd_populate_kernel(&init_mm, &__pmd, pgtable);
 62 
 63         for (i = 0; i < PTRS_PER_PTE; i++, addr += PAGE_SIZE) {
 64                 pte_t entry, *pte;
 65                 pgprot_t pgprot = PAGE_KERNEL;
 66 
 67                 entry = mk_pte(head + i, pgprot);
 68                 pte = pte_offset_kernel(&__pmd, addr);
 69                 set_pte_at(&init_mm, addr, pte, entry);
 70         }
 71 
 72         spin_lock(&init_mm.page_table_lock);
 73         if (likely(pmd_leaf(*pmd))) {
 74                 /*
 75                  * Higher-order allocations from the buddy allocator must be
 76                  * able to be treated as independent small pages (as they can
 77                  * be freed individually).
 78                  */
 79                 if (!PageReserved(head))
 80                         split_page(head, get_order(PMD_SIZE));
 81 
 82                 /* Make pte visible before pmd. See comment in pmd_install(). */
 83                 smp_wmb();
 84                 pmd_populate_kernel(&init_mm, pmd, pgtable);
 85                 if (!(walk->flags & VMEMMAP_SPLIT_NO_TLB_FLUSH))
 86                         flush_tlb_kernel_range(start, start + PMD_SIZE);
 87         } else {
 88                 pte_free_kernel(&init_mm, pgtable);
 89         }
 90         spin_unlock(&init_mm.page_table_lock);
 91 
 92         return 0;
 93 }
 94 
 95 static int vmemmap_pmd_entry(pmd_t *pmd, unsigned long addr,
 96                              unsigned long next, struct mm_walk *walk)
 97 {
 98         int ret = 0;
 99         struct page *head;
100         struct vmemmap_remap_walk *vmemmap_walk = walk->private;
101 
102         /* Only splitting, not remapping the vmemmap pages. */
103         if (!vmemmap_walk->remap_pte)
104                 walk->action = ACTION_CONTINUE;
105 
106         spin_lock(&init_mm.page_table_lock);
107         head = pmd_leaf(*pmd) ? pmd_page(*pmd) : NULL;
108         /*
109          * Due to HugeTLB alignment requirements, and the vmemmap pages
110          * being at the start of the hotplugged memory region in the
111          * memory_hotplug.memmap_on_memory case, checking whether the
112          * page associated with the first vmemmap page is
113          * self-hosted is sufficient.
114          *
115          * [                  hotplugged memory                  ]
116          * [        section        ][...][        section        ]
117          * [ vmemmap ][              usable memory               ]
118          *   ^  | ^                        |
119          *   +--+ |                        |
120          *        +------------------------+
121          */
122         if (IS_ENABLED(CONFIG_MEMORY_HOTPLUG) && unlikely(!vmemmap_walk->nr_walked)) {
123                 struct page *page = head ? head + pte_index(addr) :
124                                     pte_page(ptep_get(pte_offset_kernel(pmd, addr)));
125 
126                 if (PageVmemmapSelfHosted(page))
127                         ret = -ENOTSUPP;
128         }
129         spin_unlock(&init_mm.page_table_lock);
130         if (!head || ret)
131                 return ret;
132 
133         return vmemmap_split_pmd(pmd, head, addr & PMD_MASK, vmemmap_walk);
134 }
135 
136 static int vmemmap_pte_entry(pte_t *pte, unsigned long addr,
137                              unsigned long next, struct mm_walk *walk)
138 {
139         struct vmemmap_remap_walk *vmemmap_walk = walk->private;
140 
141         /*
142          * The reuse_page is found 'first' during the page table walk,
143          * before remapping starts.
144          */
145         if (!vmemmap_walk->reuse_page)
146                 vmemmap_walk->reuse_page = pte_page(ptep_get(pte));
147         else
148                 vmemmap_walk->remap_pte(pte, addr, vmemmap_walk);
149         vmemmap_walk->nr_walked++;
150 
151         return 0;
152 }
153 
154 static const struct mm_walk_ops vmemmap_remap_ops = {
155         .pmd_entry      = vmemmap_pmd_entry,
156         .pte_entry      = vmemmap_pte_entry,
157 };
158 
159 static int vmemmap_remap_range(unsigned long start, unsigned long end,
160                                struct vmemmap_remap_walk *walk)
161 {
162         int ret;
163 
164         VM_BUG_ON(!PAGE_ALIGNED(start | end));
165 
166         mmap_read_lock(&init_mm);
167         ret = walk_page_range_novma(&init_mm, start, end, &vmemmap_remap_ops,
168                                     NULL, walk);
169         mmap_read_unlock(&init_mm);
170         if (ret)
171                 return ret;
172 
173         if (walk->remap_pte && !(walk->flags & VMEMMAP_REMAP_NO_TLB_FLUSH))
174                 flush_tlb_kernel_range(start, end);
175 
176         return 0;
177 }
178 
179 /*
180  * Free a vmemmap page. A vmemmap page can be allocated from the memblock
181  * allocator or the buddy allocator. If the PG_reserved flag is set, the
182  * page was allocated from the memblock allocator; free it via
183  * free_bootmem_page(). Otherwise, use __free_page().
184  */
185 static inline void free_vmemmap_page(struct page *page)
186 {
187         if (PageReserved(page)) {
188                 free_bootmem_page(page);
189                 mod_node_page_state(page_pgdat(page), NR_MEMMAP_BOOT, -1);
190         } else {
191                 __free_page(page);
192                 mod_node_page_state(page_pgdat(page), NR_MEMMAP, -1);
193         }
194 }
195 
196 /* Free a list of the vmemmap pages */
197 static void free_vmemmap_page_list(struct list_head *list)
198 {
199         struct page *page, *next;
200 
201         list_for_each_entry_safe(page, next, list, lru)
202                 free_vmemmap_page(page);
203 }
204 
205 static void vmemmap_remap_pte(pte_t *pte, unsigned long addr,
206                               struct vmemmap_remap_walk *walk)
207 {
208         /*
209          * Remap the tail pages as read-only to catch illegal write operations
210          * to the tail pages.
211          */
212         pgprot_t pgprot = PAGE_KERNEL_RO;
213         struct page *page = pte_page(ptep_get(pte));
214         pte_t entry;
215 
216         /* Remapping the head page requires r/w */
217         if (unlikely(addr == walk->reuse_addr)) {
218                 pgprot = PAGE_KERNEL;
219                 list_del(&walk->reuse_page->lru);
220 
221                 /*
222                  * Makes sure that preceding stores to the page contents from
223                  * vmemmap_remap_free() become visible before the set_pte_at()
224                  * write.
225                  */
226                 smp_wmb();
227         }
228 
229         entry = mk_pte(walk->reuse_page, pgprot);
230         list_add(&page->lru, walk->vmemmap_pages);
231         set_pte_at(&init_mm, addr, pte, entry);
232 }
233 
234 /*
235  * How many struct page structs need to be reset. When we reuse the head
236  * struct page, the special metadata (e.g. page->flags or page->mapping)
237  * cannot be copied to the tail struct page structs. The invalid values
238  * would be caught by free_tail_page_prepare(). In order to avoid the
239  * "corrupted mapping in tail page" message, we need to reset at least 3
240  * struct page structs (one head struct page struct and two tail struct
241  * page structs).
242  */
243 #define NR_RESET_STRUCT_PAGE            3
244 
245 static inline void reset_struct_pages(struct page *start)
246 {
247         struct page *from = start + NR_RESET_STRUCT_PAGE;
248 
249         BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page));
250         memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE);
251 }
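/*
 * Illustrative arithmetic behind the BUILD_BUG_ON above (assuming 4 KiB
 * pages and a 64-byte struct page): PAGE_SIZE / sizeof(struct page) = 64,
 * so a single vmemmap page holds 64 struct pages. The check guarantees that
 * both the destination range [0, NR_RESET_STRUCT_PAGE) and the source range
 * [NR_RESET_STRUCT_PAGE, 2 * NR_RESET_STRUCT_PAGE) of the memcpy() fit
 * within that one page being restored.
 */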
252 
253 static void vmemmap_restore_pte(pte_t *pte, unsigned long addr,
254                                 struct vmemmap_remap_walk *walk)
255 {
256         pgprot_t pgprot = PAGE_KERNEL;
257         struct page *page;
258         void *to;
259 
260         BUG_ON(pte_page(ptep_get(pte)) != walk->reuse_page);
261 
262         page = list_first_entry(walk->vmemmap_pages, struct page, lru);
263         list_del(&page->lru);
264         to = page_to_virt(page);
265         copy_page(to, (void *)walk->reuse_addr);
266         reset_struct_pages(to);
267 
268         /*
269          * Makes sure that preceding stores to the page contents become visible
270          * before the set_pte_at() write.
271          */
272         smp_wmb();
273         set_pte_at(&init_mm, addr, pte, mk_pte(page, pgprot));
274 }
275 
276 /**
277  * vmemmap_remap_split - split the PMDs mapping the vmemmap virtual address
278  *                      range [@start, @end) into PTE-level mappings
279  * @start:     start address of the vmemmap virtual address range that we want
280  *             to remap.
281  * @end:       end address of the vmemmap virtual address range that we want to
282  *             remap.
283  * @reuse:     reuse address.
284  *
285  * Return: %0 on success, negative error code otherwise.
286  */
287 static int vmemmap_remap_split(unsigned long start, unsigned long end,
288                                unsigned long reuse)
289 {
290         struct vmemmap_remap_walk walk = {
291                 .remap_pte      = NULL,
292                 .flags          = VMEMMAP_SPLIT_NO_TLB_FLUSH,
293         };
294 
295         /* See the comment in the vmemmap_remap_free(). */
296         BUG_ON(start - reuse != PAGE_SIZE);
297 
298         return vmemmap_remap_range(reuse, end, &walk);
299 }
300 
301 /**
302  * vmemmap_remap_free - remap the vmemmap virtual address range [@start, @end)
303  *                      to the page which @reuse is mapped to, then free the
304  *                      vmemmap pages which the range was mapped to.
305  * @start:      start address of the vmemmap virtual address range that we want
306  *              to remap.
307  * @end:        end address of the vmemmap virtual address range that we want to
308  *              remap.
309  * @reuse:      reuse address.
310  * @vmemmap_pages: list to deposit vmemmap pages to be freed.  It is the
311  *              caller's responsibility to free the pages.
312  * @flags:      modifications to vmemmap_remap_walk flags
313  *
314  * Return: %0 on success, negative error code otherwise.
315  */
316 static int vmemmap_remap_free(unsigned long start, unsigned long end,
317                               unsigned long reuse,
318                               struct list_head *vmemmap_pages,
319                               unsigned long flags)
320 {
321         int ret;
322         struct vmemmap_remap_walk walk = {
323                 .remap_pte      = vmemmap_remap_pte,
324                 .reuse_addr     = reuse,
325                 .vmemmap_pages  = vmemmap_pages,
326                 .flags          = flags,
327         };
328         int nid = page_to_nid((struct page *)reuse);
329         gfp_t gfp_mask = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
330 
331         /*
332          * Allocate a new head vmemmap page to avoid breaking a contiguous
333          * block of struct page memory when freeing it back to the page
334          * allocator in free_vmemmap_page_list(). This allows the likely
335          * contiguous struct page backing memory to be kept contiguous,
336          * allowing for more hugepage allocations. Fall back to the currently
337          * mapped head page should the allocation fail.
338          */
339         walk.reuse_page = alloc_pages_node(nid, gfp_mask, 0);
340         if (walk.reuse_page) {
341                 copy_page(page_to_virt(walk.reuse_page),
342                           (void *)walk.reuse_addr);
343                 list_add(&walk.reuse_page->lru, vmemmap_pages);
344                 mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, 1);
345         }
346 
347         /*
348          * In order to make the remapping routine most efficient for huge pages,
349          * the vmemmap page table walking routine has the following rules
350          * (see vmemmap_pte_entry() for more details):
351          *
352          * - The range [@start, @end) and the range [@reuse, @reuse + PAGE_SIZE)
353          *   should be contiguous.
354          * - The @reuse address is part of the range [@reuse, @end) that we are
355          *   walking which is passed to vmemmap_remap_range().
356          * - The @reuse address is the first in the complete range.
357          *
358          * So we need to make sure that @start and @reuse meet the above rules.
359          */
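        /*
         * A hypothetical layout for a 2 MiB hugepage with 4 KiB base pages
         * (8 vmemmap pages), purely to illustrate the rules above:
         *
         *   reuse = vmemmap_base                 (head vmemmap page, kept)
         *   start = vmemmap_base + PAGE_SIZE     (first tail vmemmap page)
         *   end   = vmemmap_base + 8 * PAGE_SIZE
         *
         * The walk covers [reuse, end); the first PTE visited supplies
         * walk->reuse_page, and every subsequent PTE is remapped to it.
         */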
360         BUG_ON(start - reuse != PAGE_SIZE);
361 
362         ret = vmemmap_remap_range(reuse, end, &walk);
363         if (ret && walk.nr_walked) {
364                 end = reuse + walk.nr_walked * PAGE_SIZE;
365                 /*
366                  * vmemmap_pages contains pages from the previous
367                  * vmemmap_remap_range call which failed.  These
368                  * are pages which were removed from the vmemmap.
369                  * They will be restored in the following call.
370                  */
371                 walk = (struct vmemmap_remap_walk) {
372                         .remap_pte      = vmemmap_restore_pte,
373                         .reuse_addr     = reuse,
374                         .vmemmap_pages  = vmemmap_pages,
375                         .flags          = 0,
376                 };
377 
378                 vmemmap_remap_range(reuse, end, &walk);
379         }
380 
381         return ret;
382 }
383 
384 static int alloc_vmemmap_page_list(unsigned long start, unsigned long end,
385                                    struct list_head *list)
386 {
387         gfp_t gfp_mask = GFP_KERNEL | __GFP_RETRY_MAYFAIL;
388         unsigned long nr_pages = (end - start) >> PAGE_SHIFT;
389         int nid = page_to_nid((struct page *)start);
390         struct page *page, *next;
391         int i;
392 
393         for (i = 0; i < nr_pages; i++) {
394                 page = alloc_pages_node(nid, gfp_mask, 0);
395                 if (!page) {
396                         mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, i);
397                         goto out;
398                 }
399                 list_add(&page->lru, list);
400         }
401 
402         mod_node_page_state(NODE_DATA(nid), NR_MEMMAP, nr_pages);
403 
404         return 0;
405 out:
406         list_for_each_entry_safe(page, next, list, lru)
407                 __free_page(page);
408         return -ENOMEM;
409 }
410 
411 /**
412  * vmemmap_remap_alloc - remap the vmemmap virtual address range [@start, @end)
413  *                       to the pages which are taken from @vmemmap_pages,
414  *                       respectively.
415  * @start:      start address of the vmemmap virtual address range that we want
416  *              to remap.
417  * @end:        end address of the vmemmap virtual address range that we want to
418  *              remap.
419  * @reuse:      reuse address.
420  * @flags:      modifications to vmemmap_remap_walk flags
421  *
422  * Return: %0 on success, negative error code otherwise.
423  */
424 static int vmemmap_remap_alloc(unsigned long start, unsigned long end,
425                                unsigned long reuse, unsigned long flags)
426 {
427         LIST_HEAD(vmemmap_pages);
428         struct vmemmap_remap_walk walk = {
429                 .remap_pte      = vmemmap_restore_pte,
430                 .reuse_addr     = reuse,
431                 .vmemmap_pages  = &vmemmap_pages,
432                 .flags          = flags,
433         };
434 
435         /* See the comment in the vmemmap_remap_free(). */
436         BUG_ON(start - reuse != PAGE_SIZE);
437 
438         if (alloc_vmemmap_page_list(start, end, &vmemmap_pages))
439                 return -ENOMEM;
440 
441         return vmemmap_remap_range(reuse, end, &walk);
442 }
443 
444 DEFINE_STATIC_KEY_FALSE(hugetlb_optimize_vmemmap_key);
445 EXPORT_SYMBOL(hugetlb_optimize_vmemmap_key);
446 
447 static bool vmemmap_optimize_enabled = IS_ENABLED(CONFIG_HUGETLB_PAGE_OPTIMIZE_VMEMMAP_DEFAULT_ON);
448 core_param(hugetlb_free_vmemmap, vmemmap_optimize_enabled, bool, 0);
449 
450 static int __hugetlb_vmemmap_restore_folio(const struct hstate *h,
451                                            struct folio *folio, unsigned long flags)
452 {
453         int ret;
454         unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
455         unsigned long vmemmap_reuse;
456 
457         VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
458         VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
459 
460         if (!folio_test_hugetlb_vmemmap_optimized(folio))
461                 return 0;
462 
463         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
464         vmemmap_reuse   = vmemmap_start;
465         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
466 
467         /*
468          * The pages which the vmemmap virtual address range [@vmemmap_start,
469          * @vmemmap_end) was mapped to were freed to the buddy allocator, and
470          * the range was remapped to the page which @vmemmap_reuse is mapped to.
471          * When a HugeTLB page is freed to the buddy allocator, the previously
472          * discarded vmemmap pages must be allocated and remapped.
473          */
474         ret = vmemmap_remap_alloc(vmemmap_start, vmemmap_end, vmemmap_reuse, flags);
475         if (!ret) {
476                 folio_clear_hugetlb_vmemmap_optimized(folio);
477                 static_branch_dec(&hugetlb_optimize_vmemmap_key);
478         }
479 
480         return ret;
481 }
482 
483 /**
484  * hugetlb_vmemmap_restore_folio - restore previously optimized (by
485  *                              hugetlb_vmemmap_optimize_folio()) vmemmap pages which
486  *                              will be reallocated and remapped.
487  * @h:          struct hstate.
488  * @folio:     the folio whose vmemmap pages will be restored.
489  *
490  * Return: %0 if @folio's vmemmap pages have been reallocated and remapped,
491  * negative error code otherwise.
492  */
493 int hugetlb_vmemmap_restore_folio(const struct hstate *h, struct folio *folio)
494 {
495         /* avoid writes from page_ref_add_unless() while unfolding vmemmap */
496         synchronize_rcu();
497 
498         return __hugetlb_vmemmap_restore_folio(h, folio, 0);
499 }
500 
501 /**
502  * hugetlb_vmemmap_restore_folios - restore vmemmap for every folio on the list.
503  * @h:                  hstate.
504  * @folio_list:         list of folios.
505  * @non_hvo_folios:     Output list of folios for which vmemmap exists.
506  *
507  * Return: number of folios for which vmemmap was restored, or an error code
508  *              if an error was encountered restoring vmemmap for a folio.
509  *              Folios that have vmemmap are moved to the non_hvo_folios
510  *              list.  Processing of entries stops when the first error is
511  *              encountered. The folio that experienced the error and all
512  *              non-processed folios will remain on folio_list.
513  */
514 long hugetlb_vmemmap_restore_folios(const struct hstate *h,
515                                         struct list_head *folio_list,
516                                         struct list_head *non_hvo_folios)
517 {
518         struct folio *folio, *t_folio;
519         long restored = 0;
520         long ret = 0;
521 
522         /* avoid writes from page_ref_add_unless() while unfolding vmemmap */
523         synchronize_rcu();
524 
525         list_for_each_entry_safe(folio, t_folio, folio_list, lru) {
526                 if (folio_test_hugetlb_vmemmap_optimized(folio)) {
527                         ret = __hugetlb_vmemmap_restore_folio(h, folio,
528                                                               VMEMMAP_REMAP_NO_TLB_FLUSH);
529                         if (ret)
530                                 break;
531                         restored++;
532                 }
533 
534                 /* Add non-optimized folios to output list */
535                 list_move(&folio->lru, non_hvo_folios);
536         }
537 
538         if (restored)
539                 flush_tlb_all();
540         if (!ret)
541                 ret = restored;
542         return ret;
543 }
544 
545 /* Return true iff this is a HugeTLB folio whose vmemmap should and can be optimized. */
546 static bool vmemmap_should_optimize_folio(const struct hstate *h, struct folio *folio)
547 {
548         if (folio_test_hugetlb_vmemmap_optimized(folio))
549                 return false;
550 
551         if (!READ_ONCE(vmemmap_optimize_enabled))
552                 return false;
553 
554         if (!hugetlb_vmemmap_optimizable(h))
555                 return false;
556 
557         return true;
558 }
559 
560 static int __hugetlb_vmemmap_optimize_folio(const struct hstate *h,
561                                             struct folio *folio,
562                                             struct list_head *vmemmap_pages,
563                                             unsigned long flags)
564 {
565         int ret = 0;
566         unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
567         unsigned long vmemmap_reuse;
568 
569         VM_WARN_ON_ONCE_FOLIO(!folio_test_hugetlb(folio), folio);
570         VM_WARN_ON_ONCE_FOLIO(folio_ref_count(folio), folio);
571 
572         if (!vmemmap_should_optimize_folio(h, folio))
573                 return ret;
574 
575         static_branch_inc(&hugetlb_optimize_vmemmap_key);
576         /*
577          * Very Subtle
578          * If VMEMMAP_REMAP_NO_TLB_FLUSH is set, TLB flushing is not performed
579          * immediately after remapping.  As a result, subsequent accesses
580          * and modifications to struct pages associated with the hugetlb
581          * page could be to the OLD struct pages.  Set the vmemmap optimized
582          * flag here so that it is copied to the new head page.  This keeps
583          * the old and new struct pages in sync.
584          * If there is an error during optimization, we will immediately FLUSH
585          * the TLB and clear the flag below.
586          */
587         folio_set_hugetlb_vmemmap_optimized(folio);
588 
589         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
590         vmemmap_reuse   = vmemmap_start;
591         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
592 
593         /*
594          * Remap the vmemmap virtual address range [@vmemmap_start, @vmemmap_end)
595          * to the page which @vmemmap_reuse is mapped to.  Add pages previously
596          * mapping the range to vmemmap_pages list so that they can be freed by
597          * the caller.
598          */
599         ret = vmemmap_remap_free(vmemmap_start, vmemmap_end, vmemmap_reuse,
600                                  vmemmap_pages, flags);
601         if (ret) {
602                 static_branch_dec(&hugetlb_optimize_vmemmap_key);
603                 folio_clear_hugetlb_vmemmap_optimized(folio);
604         }
605 
606         return ret;
607 }
608 
609 /**
610  * hugetlb_vmemmap_optimize_folio - optimize @folio's vmemmap pages.
611  * @h:          struct hstate.
612  * @folio:     the folio whose vmemmap pages will be optimized.
613  *
614  * This function only tries to optimize @folio's vmemmap pages and does not
615  * guarantee that the optimization will succeed after it returns. The caller
616  * can use folio_test_hugetlb_vmemmap_optimized(@folio) to detect if @folio's
617  * vmemmap pages have been optimized.
618  */
619 void hugetlb_vmemmap_optimize_folio(const struct hstate *h, struct folio *folio)
620 {
621         LIST_HEAD(vmemmap_pages);
622 
623         /* avoid writes from page_ref_add_unless() while folding vmemmap */
624         synchronize_rcu();
625 
626         __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages, 0);
627         free_vmemmap_page_list(&vmemmap_pages);
628 }
629 
630 static int hugetlb_vmemmap_split_folio(const struct hstate *h, struct folio *folio)
631 {
632         unsigned long vmemmap_start = (unsigned long)&folio->page, vmemmap_end;
633         unsigned long vmemmap_reuse;
634 
635         if (!vmemmap_should_optimize_folio(h, folio))
636                 return 0;
637 
638         vmemmap_end     = vmemmap_start + hugetlb_vmemmap_size(h);
639         vmemmap_reuse   = vmemmap_start;
640         vmemmap_start   += HUGETLB_VMEMMAP_RESERVE_SIZE;
641 
642         /*
643          * Split PMDs on the vmemmap virtual address range [@vmemmap_start,
644          * @vmemmap_end).
645          */
646         return vmemmap_remap_split(vmemmap_start, vmemmap_end, vmemmap_reuse);
647 }
648 
649 void hugetlb_vmemmap_optimize_folios(struct hstate *h, struct list_head *folio_list)
650 {
651         struct folio *folio;
652         LIST_HEAD(vmemmap_pages);
653 
654         list_for_each_entry(folio, folio_list, lru) {
655                 int ret = hugetlb_vmemmap_split_folio(h, folio);
656 
657                 /*
658                  * Splitting the PMD requires allocating a page, so let's fail
659                  * early once we encounter the first OOM. There is no point in
660                  * retrying, as the split can be done dynamically on remap with
661                  * the memory we get back from the vmemmap deduplication.
662                  */
663                 if (ret == -ENOMEM)
664                         break;
665         }
666 
667         flush_tlb_all();
668 
669         /* avoid writes from page_ref_add_unless() while folding vmemmap */
670         synchronize_rcu();
671 
672         list_for_each_entry(folio, folio_list, lru) {
673                 int ret;
674 
675                 ret = __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
676                                                        VMEMMAP_REMAP_NO_TLB_FLUSH);
677 
678                 /*
679                  * Pages to be freed may have been accumulated.  If we
680                  * encounter an ENOMEM, free what we have and try again.
681                  * This can occur in the case that both splitting fails
682                  * halfway and head page allocation also fails. In this
683                  * case __hugetlb_vmemmap_optimize_folio() would free memory,
684                  * allowing more vmemmap remaps to occur.
685                  */
686                 if (ret == -ENOMEM && !list_empty(&vmemmap_pages)) {
687                         flush_tlb_all();
688                         free_vmemmap_page_list(&vmemmap_pages);
689                         INIT_LIST_HEAD(&vmemmap_pages);
690                         __hugetlb_vmemmap_optimize_folio(h, folio, &vmemmap_pages,
691                                                          VMEMMAP_REMAP_NO_TLB_FLUSH);
692                 }
693         }
694 
695         flush_tlb_all();
696         free_vmemmap_page_list(&vmemmap_pages);
697 }
698 
699 static struct ctl_table hugetlb_vmemmap_sysctls[] = {
700         {
701                 .procname       = "hugetlb_optimize_vmemmap",
702                 .data           = &vmemmap_optimize_enabled,
703                 .maxlen         = sizeof(vmemmap_optimize_enabled),
704                 .mode           = 0644,
705                 .proc_handler   = proc_dobool,
706         },
707 };
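/*
 * Usage sketch for the two knobs defined in this file (see
 * Documentation/admin-guide/kernel-parameters.txt and
 * Documentation/admin-guide/sysctl/vm.rst for the authoritative details):
 *
 *   hugetlb_free_vmemmap=on                 # kernel command line (core_param above)
 *   sysctl vm.hugetlb_optimize_vmemmap=1    # runtime toggle via the table above
 *
 * The toggle only affects hugepages allocated after it is changed.
 */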
708 
709 static int __init hugetlb_vmemmap_init(void)
710 {
711         const struct hstate *h;
712 
713         /* HUGETLB_VMEMMAP_RESERVE_SIZE should cover all used struct pages */
714         BUILD_BUG_ON(__NR_USED_SUBPAGE > HUGETLB_VMEMMAP_RESERVE_PAGES);
715 
716         for_each_hstate(h) {
717                 if (hugetlb_vmemmap_optimizable(h)) {
718                         register_sysctl_init("vm", hugetlb_vmemmap_sysctls);
719                         break;
720                 }
721         }
722         return 0;
723 }
724 late_initcall(hugetlb_vmemmap_init);
725 
