
TOMOYO Linux Cross Reference
Linux/arch/powerpc/mm/book3s64/radix_pgtable.c


  1 // SPDX-License-Identifier: GPL-2.0-or-later
  2 /*
  3  * Page table handling routines for radix page table.
  4  *
  5  * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation.
  6  */
  7 
  8 #define pr_fmt(fmt) "radix-mmu: " fmt
  9 
 10 #include <linux/io.h>
 11 #include <linux/kernel.h>
 12 #include <linux/sched/mm.h>
 13 #include <linux/memblock.h>
 14 #include <linux/of.h>
 15 #include <linux/of_fdt.h>
 16 #include <linux/mm.h>
 17 #include <linux/hugetlb.h>
 18 #include <linux/string_helpers.h>
 19 #include <linux/memory.h>
 20 #include <linux/kfence.h>
 21 
 22 #include <asm/pgalloc.h>
 23 #include <asm/mmu_context.h>
 24 #include <asm/dma.h>
 25 #include <asm/machdep.h>
 26 #include <asm/mmu.h>
 27 #include <asm/firmware.h>
 28 #include <asm/powernv.h>
 29 #include <asm/sections.h>
 30 #include <asm/smp.h>
 31 #include <asm/trace.h>
 32 #include <asm/uaccess.h>
 33 #include <asm/ultravisor.h>
 34 #include <asm/set_memory.h>
 35 #include <asm/kfence.h>
 36 
 37 #include <trace/events/thp.h>
 38 
 39 #include <mm/mmu_decl.h>
 40 
 41 unsigned int mmu_base_pid;
 42 
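/*
 * Boot-time page table allocator: grabs naturally aligned (align == size)
 * memory from memblock, optionally constrained to a node and to the
 * [region_start, region_end) physical range, and panics on failure since
 * there is no way to recover this early in boot.
 */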
 43 static __ref void *early_alloc_pgtable(unsigned long size, int nid,
 44                         unsigned long region_start, unsigned long region_end)
 45 {
 46         phys_addr_t min_addr = MEMBLOCK_LOW_LIMIT;
 47         phys_addr_t max_addr = MEMBLOCK_ALLOC_ANYWHERE;
 48         void *ptr;
 49 
 50         if (region_start)
 51                 min_addr = region_start;
 52         if (region_end)
 53                 max_addr = region_end;
 54 
 55         ptr = memblock_alloc_try_nid(size, size, min_addr, max_addr, nid);
 56 
 57         if (!ptr)
 58                 panic("%s: Failed to allocate %lu bytes align=0x%lx nid=%d from=%pa max_addr=%pa\n",
 59                       __func__, size, size, nid, &min_addr, &max_addr);
 60 
 61         return ptr;
 62 }
 63 
 64 /*
 65  * When allocating pud or pmd pointers, we allocate a complete page
 66  * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
 67  * is to ensure that the page obtained from the memblock allocator
 68  * can be completely used as page table page and can be freed
 69  * correctly when the page table entries are removed.
 70  */
 71 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
 72                           pgprot_t flags,
 73                           unsigned int map_page_size,
 74                           int nid,
 75                           unsigned long region_start, unsigned long region_end)
 76 {
 77         unsigned long pfn = pa >> PAGE_SHIFT;
 78         pgd_t *pgdp;
 79         p4d_t *p4dp;
 80         pud_t *pudp;
 81         pmd_t *pmdp;
 82         pte_t *ptep;
 83 
 84         pgdp = pgd_offset_k(ea);
 85         p4dp = p4d_offset(pgdp, ea);
 86         if (p4d_none(*p4dp)) {
 87                 pudp = early_alloc_pgtable(PAGE_SIZE, nid,
 88                                            region_start, region_end);
 89                 p4d_populate(&init_mm, p4dp, pudp);
 90         }
 91         pudp = pud_offset(p4dp, ea);
 92         if (map_page_size == PUD_SIZE) {
 93                 ptep = (pte_t *)pudp;
 94                 goto set_the_pte;
 95         }
 96         if (pud_none(*pudp)) {
 97                 pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
 98                                            region_end);
 99                 pud_populate(&init_mm, pudp, pmdp);
100         }
101         pmdp = pmd_offset(pudp, ea);
102         if (map_page_size == PMD_SIZE) {
103                 ptep = pmdp_ptep(pmdp);
104                 goto set_the_pte;
105         }
106         if (!pmd_present(*pmdp)) {
107                 ptep = early_alloc_pgtable(PAGE_SIZE, nid,
108                                                 region_start, region_end);
109                 pmd_populate_kernel(&init_mm, pmdp, ptep);
110         }
111         ptep = pte_offset_kernel(pmdp, ea);
112 
113 set_the_pte:
114         set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
115         asm volatile("ptesync": : :"memory");
116         return 0;
117 }
118 
119 /*
120  * nid, region_start, and region_end are hints to try to place the page
121  * table memory in the same node or region.
122  */
123 static int __map_kernel_page(unsigned long ea, unsigned long pa,
124                           pgprot_t flags,
125                           unsigned int map_page_size,
126                           int nid,
127                           unsigned long region_start, unsigned long region_end)
128 {
129         unsigned long pfn = pa >> PAGE_SHIFT;
130         pgd_t *pgdp;
131         p4d_t *p4dp;
132         pud_t *pudp;
133         pmd_t *pmdp;
134         pte_t *ptep;
135         /*
136          * Make sure task size is correct as per the max address
137          */
138         BUILD_BUG_ON(TASK_SIZE_USER64 > RADIX_PGTABLE_RANGE);
139 
140 #ifdef CONFIG_PPC_64K_PAGES
141         BUILD_BUG_ON(RADIX_KERN_MAP_SIZE != (1UL << MAX_EA_BITS_PER_CONTEXT));
142 #endif
143 
144         if (unlikely(!slab_is_available()))
145                 return early_map_kernel_page(ea, pa, flags, map_page_size,
146                                                 nid, region_start, region_end);
147 
148         /*
149          * Should make page table allocation functions be able to take a
150          * node, so we can place kernel page tables on the right nodes after
151          * boot.
152          */
153         pgdp = pgd_offset_k(ea);
154         p4dp = p4d_offset(pgdp, ea);
155         pudp = pud_alloc(&init_mm, p4dp, ea);
156         if (!pudp)
157                 return -ENOMEM;
158         if (map_page_size == PUD_SIZE) {
159                 ptep = (pte_t *)pudp;
160                 goto set_the_pte;
161         }
162         pmdp = pmd_alloc(&init_mm, pudp, ea);
163         if (!pmdp)
164                 return -ENOMEM;
165         if (map_page_size == PMD_SIZE) {
166                 ptep = pmdp_ptep(pmdp);
167                 goto set_the_pte;
168         }
169         ptep = pte_alloc_kernel(pmdp, ea);
170         if (!ptep)
171                 return -ENOMEM;
172 
173 set_the_pte:
174         set_pte_at(&init_mm, ea, ptep, pfn_pte(pfn, flags));
175         asm volatile("ptesync": : :"memory");
176         return 0;
177 }
178 
179 int radix__map_kernel_page(unsigned long ea, unsigned long pa,
180                           pgprot_t flags,
181                           unsigned int map_page_size)
182 {
183         return __map_kernel_page(ea, pa, flags, map_page_size, -1, 0, 0);
184 }
185 
186 #ifdef CONFIG_STRICT_KERNEL_RWX
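/*
 * Walk the kernel page tables for [start, end) and clear the given PTE
 * bits (e.g. _PAGE_WRITE or _PAGE_EXEC) on each leaf entry, whether the
 * mapping is at PUD, PMD or PTE level, then flush the kernel TLB range.
 */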
187 static void radix__change_memory_range(unsigned long start, unsigned long end,
188                                        unsigned long clear)
189 {
190         unsigned long idx;
191         pgd_t *pgdp;
192         p4d_t *p4dp;
193         pud_t *pudp;
194         pmd_t *pmdp;
195         pte_t *ptep;
196 
197         start = ALIGN_DOWN(start, PAGE_SIZE);
198         end = PAGE_ALIGN(end); // aligns up
199 
200         pr_debug("Changing flags on range %lx-%lx removing 0x%lx\n",
201                  start, end, clear);
202 
203         for (idx = start; idx < end; idx += PAGE_SIZE) {
204                 pgdp = pgd_offset_k(idx);
205                 p4dp = p4d_offset(pgdp, idx);
206                 pudp = pud_alloc(&init_mm, p4dp, idx);
207                 if (!pudp)
208                         continue;
209                 if (pud_leaf(*pudp)) {
210                         ptep = (pte_t *)pudp;
211                         goto update_the_pte;
212                 }
213                 pmdp = pmd_alloc(&init_mm, pudp, idx);
214                 if (!pmdp)
215                         continue;
216                 if (pmd_leaf(*pmdp)) {
217                         ptep = pmdp_ptep(pmdp);
218                         goto update_the_pte;
219                 }
220                 ptep = pte_alloc_kernel(pmdp, idx);
221                 if (!ptep)
222                         continue;
223 update_the_pte:
224                 radix__pte_update(&init_mm, idx, ptep, clear, 0, 0);
225         }
226 
227         radix__flush_tlb_kernel_range(start, end);
228 }
229 
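/*
 * Make kernel text and rodata read-only by clearing _PAGE_WRITE from
 * _stext up to __end_rodata. For kernels relocated above PAGE_OFFSET,
 * the interrupt vector text that remains at the start of the linear map
 * is made read-only as well.
 */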
230 void radix__mark_rodata_ro(void)
231 {
232         unsigned long start, end;
233 
234         start = (unsigned long)_stext;
235         end = (unsigned long)__end_rodata;
236 
237         radix__change_memory_range(start, end, _PAGE_WRITE);
238 
239         for (start = PAGE_OFFSET; start < (unsigned long)_stext; start += PAGE_SIZE) {
240                 end = start + PAGE_SIZE;
241                 if (overlaps_interrupt_vector_text(start, end))
242                         radix__change_memory_range(start, end, _PAGE_WRITE);
243                 else
244                         break;
245         }
246 }
247 
248 void radix__mark_initmem_nx(void)
249 {
250         unsigned long start = (unsigned long)__init_begin;
251         unsigned long end = (unsigned long)__init_end;
252 
253         radix__change_memory_range(start, end, _PAGE_EXEC);
254 }
255 #endif /* CONFIG_STRICT_KERNEL_RWX */
256 
257 static inline void __meminit
258 print_mapping(unsigned long start, unsigned long end, unsigned long size, bool exec)
259 {
260         char buf[10];
261 
262         if (end <= start)
263                 return;
264 
265         string_get_size(size, 1, STRING_UNITS_2, buf, sizeof(buf));
266 
267         pr_info("Mapped 0x%016lx-0x%016lx with %s pages%s\n", start, end, buf,
268                 exec ? " (exec)" : "");
269 }
270 
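/*
 * Return the next physical address at which the linear-map permissions
 * may need to change (end of the low interrupt text, start of relocated
 * kernel text, or __srwx_boundary), so that create_physical_mapping()
 * never covers such a boundary with a single large page. With
 * STRICT_KERNEL_RWX disabled there is no boundary before 'end'.
 */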
271 static unsigned long next_boundary(unsigned long addr, unsigned long end)
272 {
273 #ifdef CONFIG_STRICT_KERNEL_RWX
274         unsigned long stext_phys;
275 
276         stext_phys = __pa_symbol(_stext);
277 
278         // Relocatable kernel running at non-zero real address
279         if (stext_phys != 0) {
280                 // The end of interrupts code at zero is a rodata boundary
281                 unsigned long end_intr = __pa_symbol(__end_interrupts) - stext_phys;
282                 if (addr < end_intr)
283                         return end_intr;
284 
285                 // Start of relocated kernel text is a rodata boundary
286                 if (addr < stext_phys)
287                         return stext_phys;
288         }
289 
290         if (addr < __pa_symbol(__srwx_boundary))
291                 return __pa_symbol(__srwx_boundary);
292 #endif
293         return end;
294 }
295 
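/*
 * Map the physical range [start, end) into the linear mapping using the
 * largest page size that fits: 1G (PUD_SIZE) if the address is suitably
 * aligned, the gap to the next boundary allows it and the MMU reports 1G
 * support; else 2M (PMD_SIZE) under the same conditions; else the base
 * page size. Kernel text and interrupt-vector pages get PAGE_KERNEL_X,
 * everything else the caller-supplied protection. As an illustration: a
 * 1G-aligned gap of at least 1G is covered by a single 1G page, a
 * 2M-aligned remainder of at least 2M uses 2M pages, and any smaller
 * tail before a permission boundary falls back to base pages.
 */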
296 static int __meminit create_physical_mapping(unsigned long start,
297                                              unsigned long end,
298                                              int nid, pgprot_t _prot,
299                                              unsigned long mapping_sz_limit)
300 {
301         unsigned long vaddr, addr, mapping_size = 0;
302         bool prev_exec, exec = false;
303         pgprot_t prot;
304         int psize;
305         unsigned long max_mapping_size = memory_block_size;
306 
307         if (mapping_sz_limit < max_mapping_size)
308                 max_mapping_size = mapping_sz_limit;
309 
310         if (debug_pagealloc_enabled())
311                 max_mapping_size = PAGE_SIZE;
312 
313         start = ALIGN(start, PAGE_SIZE);
314         end   = ALIGN_DOWN(end, PAGE_SIZE);
315         for (addr = start; addr < end; addr += mapping_size) {
316                 unsigned long gap, previous_size;
317                 int rc;
318 
319                 gap = next_boundary(addr, end) - addr;
320                 if (gap > max_mapping_size)
321                         gap = max_mapping_size;
322                 previous_size = mapping_size;
323                 prev_exec = exec;
324 
325                 if (IS_ALIGNED(addr, PUD_SIZE) && gap >= PUD_SIZE &&
326                     mmu_psize_defs[MMU_PAGE_1G].shift) {
327                         mapping_size = PUD_SIZE;
328                         psize = MMU_PAGE_1G;
329                 } else if (IS_ALIGNED(addr, PMD_SIZE) && gap >= PMD_SIZE &&
330                            mmu_psize_defs[MMU_PAGE_2M].shift) {
331                         mapping_size = PMD_SIZE;
332                         psize = MMU_PAGE_2M;
333                 } else {
334                         mapping_size = PAGE_SIZE;
335                         psize = mmu_virtual_psize;
336                 }
337 
338                 vaddr = (unsigned long)__va(addr);
339 
340                 if (overlaps_kernel_text(vaddr, vaddr + mapping_size) ||
341                     overlaps_interrupt_vector_text(vaddr, vaddr + mapping_size)) {
342                         prot = PAGE_KERNEL_X;
343                         exec = true;
344                 } else {
345                         prot = _prot;
346                         exec = false;
347                 }
348 
349                 if (mapping_size != previous_size || exec != prev_exec) {
350                         print_mapping(start, addr, previous_size, prev_exec);
351                         start = addr;
352                 }
353 
354                 rc = __map_kernel_page(vaddr, addr, prot, mapping_size, nid, start, end);
355                 if (rc)
356                         return rc;
357 
358                 update_page_count(psize, 1);
359         }
360 
361         print_mapping(start, addr, mapping_size, exec);
362         return 0;
363 }
364 
365 #ifdef CONFIG_KFENCE
366 static bool __ro_after_init kfence_early_init = !!CONFIG_KFENCE_SAMPLE_INTERVAL;
367 
368 static int __init parse_kfence_early_init(char *arg)
369 {
370         int val;
371 
372         if (get_option(&arg, &val))
373                 kfence_early_init = !!val;
374         return 0;
375 }
376 early_param("kfence.sample_interval", parse_kfence_early_init);
377 
378 static inline phys_addr_t alloc_kfence_pool(void)
379 {
380         phys_addr_t kfence_pool;
381 
382         /*
383          * TODO: Support for enabling KFENCE after boot depends on the ability to
384          *       split page table mappings. As such support is not currently
385          *       implemented for radix pagetables, support enabling KFENCE
386          *       only at system startup for now.
387          *
388          *       After support for splitting mappings is available on radix,
389          *       alloc_kfence_pool() & map_kfence_pool() can be dropped and
390          *       mapping for __kfence_pool memory can be
391          *       split during arch_kfence_init_pool().
392          */
393         if (!kfence_early_init)
394                 goto no_kfence;
395 
396         kfence_pool = memblock_phys_alloc(KFENCE_POOL_SIZE, PAGE_SIZE);
397         if (!kfence_pool)
398                 goto no_kfence;
399 
400         memblock_mark_nomap(kfence_pool, KFENCE_POOL_SIZE);
401         return kfence_pool;
402 
403 no_kfence:
404         disable_kfence();
405         return 0;
406 }
407 
408 static inline void map_kfence_pool(phys_addr_t kfence_pool)
409 {
410         if (!kfence_pool)
411                 return;
412 
413         if (create_physical_mapping(kfence_pool, kfence_pool + KFENCE_POOL_SIZE,
414                                     -1, PAGE_KERNEL, PAGE_SIZE))
415                 goto err;
416 
417         memblock_clear_nomap(kfence_pool, KFENCE_POOL_SIZE);
418         __kfence_pool = __va(kfence_pool);
419         return;
420 
421 err:
422         memblock_phys_free(kfence_pool, KFENCE_POOL_SIZE);
423         disable_kfence();
424 }
425 #else
426 static inline phys_addr_t alloc_kfence_pool(void) { return 0; }
427 static inline void map_kfence_pool(phys_addr_t kfence_pool) { }
428 #endif
429 
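/*
 * Boot-time radix setup: create the linear mapping for every memblock
 * range (skipping any range that would extend past RADIX_VMALLOC_START),
 * map the optional KFENCE pool, allocate the process table and point its
 * entry 0 at the kernel page table, and reserve the first PID as a guard
 * PID for init_mm.
 */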
430 static void __init radix_init_pgtable(void)
431 {
432         phys_addr_t kfence_pool;
433         unsigned long rts_field;
434         phys_addr_t start, end;
435         u64 i;
436 
437         /* We don't support SLB for radix */
438         slb_set_size(0);
439 
440         kfence_pool = alloc_kfence_pool();
441 
442         /*
443          * Create the linear mapping
444          */
445         for_each_mem_range(i, &start, &end) {
446                 /*
447                  * The memblock allocator is up at this point, so the
448                  * page tables will be allocated within the range. No
449                  * need for a node (which we don't have yet).
450                  */
451 
452                 if (end >= RADIX_VMALLOC_START) {
453                         pr_warn("Outside the supported range\n");
454                         continue;
455                 }
456 
457                 WARN_ON(create_physical_mapping(start, end,
458                                                 -1, PAGE_KERNEL, ~0UL));
459         }
460 
461         map_kfence_pool(kfence_pool);
462 
463         if (!cpu_has_feature(CPU_FTR_HVMODE) &&
464                         cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) {
465                 /*
466                  * Older versions of KVM on these machines prefer if the
467                  * guest only uses the low 19 PID bits.
468                  */
469                 mmu_pid_bits = 19;
470         }
471         mmu_base_pid = 1;
472 
473         /*
474          * Allocate Partition table and process table for the
475          * host.
476          */
477         BUG_ON(PRTB_SIZE_SHIFT > 36);
478         process_tb = early_alloc_pgtable(1UL << PRTB_SIZE_SHIFT, -1, 0, 0);
479         /*
480          * Fill in the process table.
481          */
482         rts_field = radix__get_tree_size();
483         process_tb->prtb0 = cpu_to_be64(rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE);
484 
485         /*
486          * The init_mm context is given the first available (non-zero) PID,
487          * which is the "guard PID" and contains no page table. PIDR should
488          * never be set to zero because that duplicates the kernel address
489          * space at the 0x0... offset (quadrant 0)!
490          *
491          * An arbitrary PID that may later be allocated by the PID allocator
492          * for userspace processes must not be used either, because that
493          * would cause stale user mappings for that PID on CPUs outside of
494          * the TLB invalidation scheme (because it won't be in mm_cpumask).
495          *
496          * So permanently carve out one PID for the purpose of a guard PID.
497          */
498         init_mm.context.id = mmu_base_pid;
499         mmu_base_pid++;
500 }
501 
502 static void __init radix_init_partition_table(void)
503 {
504         unsigned long rts_field, dw0, dw1;
505 
506         mmu_partition_table_init();
507         rts_field = radix__get_tree_size();
508         dw0 = rts_field | __pa(init_mm.pgd) | RADIX_PGD_INDEX_SIZE | PATB_HR;
509         dw1 = __pa(process_tb) | (PRTB_SIZE_SHIFT - 12) | PATB_GR;
510         mmu_partition_table_set_entry(0, dw0, dw1, false);
511 
512         pr_info("Initializing Radix MMU\n");
513 }
514 
515 static int __init get_idx_from_shift(unsigned int shift)
516 {
517         int idx = -1;
518 
519         switch (shift) {
520         case 0xc:
521                 idx = MMU_PAGE_4K;
522                 break;
523         case 0x10:
524                 idx = MMU_PAGE_64K;
525                 break;
526         case 0x15:
527                 idx = MMU_PAGE_2M;
528                 break;
529         case 0x1e:
530                 idx = MMU_PAGE_1G;
531                 break;
532         }
533         return idx;
534 }
535 
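/*
 * Flat device-tree scan for "ibm,processor-radix-AP-encodings" on the cpu
 * node. Each 32-bit cell carries the AP (actual page size) encoding in its
 * top 3 bits and the page shift in the low bits. As an illustration, a
 * cell of 0xa0000010 decodes to shift 16 (64K) with AP 0x5, matching the
 * fallback defaults in radix__early_init_devtree() below.
 */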
536 static int __init radix_dt_scan_page_sizes(unsigned long node,
537                                            const char *uname, int depth,
538                                            void *data)
539 {
540         int size = 0;
541         int shift, idx;
542         unsigned int ap;
543         const __be32 *prop;
544         const char *type = of_get_flat_dt_prop(node, "device_type", NULL);
545 
546         /* We are scanning "cpu" nodes only */
547         if (type == NULL || strcmp(type, "cpu") != 0)
548                 return 0;
549 
550         /* Grab page size encodings */
551         prop = of_get_flat_dt_prop(node, "ibm,processor-radix-AP-encodings", &size);
552         if (!prop)
553                 return 0;
554 
555         pr_info("Page sizes from device-tree:\n");
556         for (; size >= 4; size -= 4, ++prop) {
557 
558                 struct mmu_psize_def *def;
559 
560                 /* top 3 bits are the AP encoding */
561                 shift = be32_to_cpu(prop[0]) & ~(0xe << 28);
562                 ap = be32_to_cpu(prop[0]) >> 29;
563                 pr_info("Page size shift = %d AP=0x%x\n", shift, ap);
564 
565                 idx = get_idx_from_shift(shift);
566                 if (idx < 0)
567                         continue;
568 
569                 def = &mmu_psize_defs[idx];
570                 def->shift = shift;
571                 def->ap  = ap;
572                 def->h_rpt_pgsize = psize_to_rpti_pgsize(idx);
573         }
574 
575         /* needed ? */
576         cur_cpu_spec->mmu_features &= ~MMU_FTR_NO_SLBIE_B;
577         return 1;
578 }
579 
580 void __init radix__early_init_devtree(void)
581 {
582         int rc;
583 
584         /*
585          * Try to find the available page sizes in the device-tree
586          */
587         rc = of_scan_flat_dt(radix_dt_scan_page_sizes, NULL);
588         if (!rc) {
589                 /*
590                  * No page size details found in device tree.
591                  * Let's assume we have page 4k and 64k support
592                  */
593                 mmu_psize_defs[MMU_PAGE_4K].shift = 12;
594                 mmu_psize_defs[MMU_PAGE_4K].ap = 0x0;
595                 mmu_psize_defs[MMU_PAGE_4K].h_rpt_pgsize =
596                         psize_to_rpti_pgsize(MMU_PAGE_4K);
597 
598                 mmu_psize_defs[MMU_PAGE_64K].shift = 16;
599                 mmu_psize_defs[MMU_PAGE_64K].ap = 0x5;
600                 mmu_psize_defs[MMU_PAGE_64K].h_rpt_pgsize =
601                         psize_to_rpti_pgsize(MMU_PAGE_64K);
602         }
603         return;
604 }
605 
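/*
 * Primary-CPU MMU bring-up for radix: publish the radix page table
 * geometry through the generic book3s64 index/table-size variables and
 * region bounds, build the kernel page tables, and on bare metal (no
 * FW_FEATURE_LPAR) enable UPRT/HR in the LPCR and install the partition
 * table. Finally switch to the guard PID and flush the local TLB.
 */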
606 void __init radix__early_init_mmu(void)
607 {
608         unsigned long lpcr;
609 
610 #ifdef CONFIG_PPC_64S_HASH_MMU
611 #ifdef CONFIG_PPC_64K_PAGES
612         /* PAGE_SIZE mappings */
613         mmu_virtual_psize = MMU_PAGE_64K;
614 #else
615         mmu_virtual_psize = MMU_PAGE_4K;
616 #endif
617 #endif
618         /*
619          * initialize page table size
620          */
621         __pte_index_size = RADIX_PTE_INDEX_SIZE;
622         __pmd_index_size = RADIX_PMD_INDEX_SIZE;
623         __pud_index_size = RADIX_PUD_INDEX_SIZE;
624         __pgd_index_size = RADIX_PGD_INDEX_SIZE;
625         __pud_cache_index = RADIX_PUD_INDEX_SIZE;
626         __pte_table_size = RADIX_PTE_TABLE_SIZE;
627         __pmd_table_size = RADIX_PMD_TABLE_SIZE;
628         __pud_table_size = RADIX_PUD_TABLE_SIZE;
629         __pgd_table_size = RADIX_PGD_TABLE_SIZE;
630 
631         __pmd_val_bits = RADIX_PMD_VAL_BITS;
632         __pud_val_bits = RADIX_PUD_VAL_BITS;
633         __pgd_val_bits = RADIX_PGD_VAL_BITS;
634 
635         __kernel_virt_start = RADIX_KERN_VIRT_START;
636         __vmalloc_start = RADIX_VMALLOC_START;
637         __vmalloc_end = RADIX_VMALLOC_END;
638         __kernel_io_start = RADIX_KERN_IO_START;
639         __kernel_io_end = RADIX_KERN_IO_END;
640         vmemmap = (struct page *)RADIX_VMEMMAP_START;
641         ioremap_bot = IOREMAP_BASE;
642 
643 #ifdef CONFIG_PCI
644         pci_io_base = ISA_IO_BASE;
645 #endif
646         __pte_frag_nr = RADIX_PTE_FRAG_NR;
647         __pte_frag_size_shift = RADIX_PTE_FRAG_SIZE_SHIFT;
648         __pmd_frag_nr = RADIX_PMD_FRAG_NR;
649         __pmd_frag_size_shift = RADIX_PMD_FRAG_SIZE_SHIFT;
650 
651         radix_init_pgtable();
652 
653         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
654                 lpcr = mfspr(SPRN_LPCR);
655                 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
656                 radix_init_partition_table();
657         } else {
658                 radix_init_pseries();
659         }
660 
661         memblock_set_current_limit(MEMBLOCK_ALLOC_ANYWHERE);
662 
663         /* Switch to the guard PID before turning on MMU */
664         radix__switch_mmu_context(NULL, &init_mm);
665         tlbiel_all();
666 }
667 
668 void radix__early_init_mmu_secondary(void)
669 {
670         unsigned long lpcr;
671         /*
672          * update partition table control register and UPRT
673          */
674         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
675                 lpcr = mfspr(SPRN_LPCR);
676                 mtspr(SPRN_LPCR, lpcr | LPCR_UPRT | LPCR_HR);
677 
678                 set_ptcr_when_no_uv(__pa(partition_tb) |
679                                     (PATB_SIZE_SHIFT - 12));
680         }
681 
682         radix__switch_mmu_context(NULL, &init_mm);
683         tlbiel_all();
684 
685         /* Make sure userspace can't change the AMR */
686         mtspr(SPRN_UAMOR, 0);
687 }
688 
689 /* Called during kexec sequence with MMU off */
690 notrace void radix__mmu_cleanup_all(void)
691 {
692         unsigned long lpcr;
693 
694         if (!firmware_has_feature(FW_FEATURE_LPAR)) {
695                 lpcr = mfspr(SPRN_LPCR);
696                 mtspr(SPRN_LPCR, lpcr & ~LPCR_UPRT);
697                 set_ptcr_when_no_uv(0);
698                 powernv_set_nmmu_ptcr(0);
699                 radix__flush_tlb_all();
700         }
701 }
702 
703 #ifdef CONFIG_MEMORY_HOTPLUG
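/*
 * Teardown helpers for memory hot-unplug: each free_*_table() frees a
 * page-table page only if every entry in it is none, then clears the
 * entry in the level above so the page can no longer be reached.
 */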
704 static void free_pte_table(pte_t *pte_start, pmd_t *pmd)
705 {
706         pte_t *pte;
707         int i;
708 
709         for (i = 0; i < PTRS_PER_PTE; i++) {
710                 pte = pte_start + i;
711                 if (!pte_none(*pte))
712                         return;
713         }
714 
715         pte_free_kernel(&init_mm, pte_start);
716         pmd_clear(pmd);
717 }
718 
719 static void free_pmd_table(pmd_t *pmd_start, pud_t *pud)
720 {
721         pmd_t *pmd;
722         int i;
723 
724         for (i = 0; i < PTRS_PER_PMD; i++) {
725                 pmd = pmd_start + i;
726                 if (!pmd_none(*pmd))
727                         return;
728         }
729 
730         pmd_free(&init_mm, pmd_start);
731         pud_clear(pud);
732 }
733 
734 static void free_pud_table(pud_t *pud_start, p4d_t *p4d)
735 {
736         pud_t *pud;
737         int i;
738 
739         for (i = 0; i < PTRS_PER_PUD; i++) {
740                 pud = pud_start + i;
741                 if (!pud_none(*pud))
742                         return;
743         }
744 
745         pud_free(&init_mm, pud_start);
746         p4d_clear(p4d);
747 }
748 
749 #ifdef CONFIG_SPARSEMEM_VMEMMAP
750 static bool __meminit vmemmap_pmd_is_unused(unsigned long addr, unsigned long end)
751 {
752         unsigned long start = ALIGN_DOWN(addr, PMD_SIZE);
753 
754         return !vmemmap_populated(start, PMD_SIZE);
755 }
756 
757 static bool __meminit vmemmap_page_is_unused(unsigned long addr, unsigned long end)
758 {
759         unsigned long start = ALIGN_DOWN(addr, PAGE_SIZE);
760 
761         return !vmemmap_populated(start, PAGE_SIZE);
762 
763 }
764 #endif
765 
766 static void __meminit free_vmemmap_pages(struct page *page,
767                                          struct vmem_altmap *altmap,
768                                          int order)
769 {
770         unsigned int nr_pages = 1 << order;
771 
772         if (altmap) {
773                 unsigned long alt_start, alt_end;
774                 unsigned long base_pfn = page_to_pfn(page);
775 
776                 /*
777                  * With 2M vmemmap mapping we can have things set up
778                  * such that even though an altmap is specified we never
779                  * use the altmap.
780                  */
781                 alt_start = altmap->base_pfn;
782                 alt_end = altmap->base_pfn + altmap->reserve + altmap->free;
783 
784                 if (base_pfn >= alt_start && base_pfn < alt_end) {
785                         vmem_altmap_free(altmap, nr_pages);
786                         return;
787                 }
788         }
789 
790         if (PageReserved(page)) {
791                 /* allocated from memblock */
792                 while (nr_pages--)
793                         free_reserved_page(page++);
794         } else
795                 free_pages((unsigned long)page_address(page), order);
796 }
797 
798 static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr,
799                                        unsigned long end, bool direct,
800                                        struct vmem_altmap *altmap)
801 {
802         unsigned long next, pages = 0;
803         pte_t *pte;
804 
805         pte = pte_start + pte_index(addr);
806         for (; addr < end; addr = next, pte++) {
807                 next = (addr + PAGE_SIZE) & PAGE_MASK;
808                 if (next > end)
809                         next = end;
810 
811                 if (!pte_present(*pte))
812                         continue;
813 
814                 if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
815                         if (!direct)
816                                 free_vmemmap_pages(pte_page(*pte), altmap, 0);
817                         pte_clear(&init_mm, addr, pte);
818                         pages++;
819                 }
820 #ifdef CONFIG_SPARSEMEM_VMEMMAP
821                 else if (!direct && vmemmap_page_is_unused(addr, next)) {
822                         free_vmemmap_pages(pte_page(*pte), altmap, 0);
823                         pte_clear(&init_mm, addr, pte);
824                 }
825 #endif
826         }
827         if (direct)
828                 update_page_count(mmu_virtual_psize, -pages);
829 }
830 
831 static void __meminit remove_pmd_table(pmd_t *pmd_start, unsigned long addr,
832                                        unsigned long end, bool direct,
833                                        struct vmem_altmap *altmap)
834 {
835         unsigned long next, pages = 0;
836         pte_t *pte_base;
837         pmd_t *pmd;
838 
839         pmd = pmd_start + pmd_index(addr);
840         for (; addr < end; addr = next, pmd++) {
841                 next = pmd_addr_end(addr, end);
842 
843                 if (!pmd_present(*pmd))
844                         continue;
845 
846                 if (pmd_leaf(*pmd)) {
847                         if (IS_ALIGNED(addr, PMD_SIZE) &&
848                             IS_ALIGNED(next, PMD_SIZE)) {
849                                 if (!direct)
850                                         free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
851                                 pte_clear(&init_mm, addr, (pte_t *)pmd);
852                                 pages++;
853                         }
854 #ifdef CONFIG_SPARSEMEM_VMEMMAP
855                         else if (!direct && vmemmap_pmd_is_unused(addr, next)) {
856                                 free_vmemmap_pages(pmd_page(*pmd), altmap, get_order(PMD_SIZE));
857                                 pte_clear(&init_mm, addr, (pte_t *)pmd);
858                         }
859 #endif
860                         continue;
861                 }
862 
863                 pte_base = (pte_t *)pmd_page_vaddr(*pmd);
864                 remove_pte_table(pte_base, addr, next, direct, altmap);
865                 free_pte_table(pte_base, pmd);
866         }
867         if (direct)
868                 update_page_count(MMU_PAGE_2M, -pages);
869 }
870 
871 static void __meminit remove_pud_table(pud_t *pud_start, unsigned long addr,
872                                        unsigned long end, bool direct,
873                                        struct vmem_altmap *altmap)
874 {
875         unsigned long next, pages = 0;
876         pmd_t *pmd_base;
877         pud_t *pud;
878 
879         pud = pud_start + pud_index(addr);
880         for (; addr < end; addr = next, pud++) {
881                 next = pud_addr_end(addr, end);
882 
883                 if (!pud_present(*pud))
884                         continue;
885 
886                 if (pud_leaf(*pud)) {
887                         if (!IS_ALIGNED(addr, PUD_SIZE) ||
888                             !IS_ALIGNED(next, PUD_SIZE)) {
889                                 WARN_ONCE(1, "%s: unaligned range\n", __func__);
890                                 continue;
891                         }
892                         pte_clear(&init_mm, addr, (pte_t *)pud);
893                         pages++;
894                         continue;
895                 }
896 
897                 pmd_base = pud_pgtable(*pud);
898                 remove_pmd_table(pmd_base, addr, next, direct, altmap);
899                 free_pmd_table(pmd_base, pud);
900         }
901         if (direct)
902                 update_page_count(MMU_PAGE_1G, -pages);
903 }
904 
905 static void __meminit
906 remove_pagetable(unsigned long start, unsigned long end, bool direct,
907                  struct vmem_altmap *altmap)
908 {
909         unsigned long addr, next;
910         pud_t *pud_base;
911         pgd_t *pgd;
912         p4d_t *p4d;
913 
914         spin_lock(&init_mm.page_table_lock);
915 
916         for (addr = start; addr < end; addr = next) {
917                 next = pgd_addr_end(addr, end);
918 
919                 pgd = pgd_offset_k(addr);
920                 p4d = p4d_offset(pgd, addr);
921                 if (!p4d_present(*p4d))
922                         continue;
923 
924                 if (p4d_leaf(*p4d)) {
925                         if (!IS_ALIGNED(addr, P4D_SIZE) ||
926                             !IS_ALIGNED(next, P4D_SIZE)) {
927                                 WARN_ONCE(1, "%s: unaligned range\n", __func__);
928                                 continue;
929                         }
930 
931                         pte_clear(&init_mm, addr, (pte_t *)pgd);
932                         continue;
933                 }
934 
935                 pud_base = p4d_pgtable(*p4d);
936                 remove_pud_table(pud_base, addr, next, direct, altmap);
937                 free_pud_table(pud_base, p4d);
938         }
939 
940         spin_unlock(&init_mm.page_table_lock);
941         radix__flush_tlb_kernel_range(start, end);
942 }
943 
944 int __meminit radix__create_section_mapping(unsigned long start,
945                                             unsigned long end, int nid,
946                                             pgprot_t prot)
947 {
948         if (end >= RADIX_VMALLOC_START) {
949                 pr_warn("Outside the supported range\n");
950                 return -1;
951         }
952 
953         return create_physical_mapping(__pa(start), __pa(end),
954                                        nid, prot, ~0UL);
955 }
956 
957 int __meminit radix__remove_section_mapping(unsigned long start, unsigned long end)
958 {
959         remove_pagetable(start, end, true, NULL);
960         return 0;
961 }
962 #endif /* CONFIG_MEMORY_HOTPLUG */
963 
964 #ifdef CONFIG_SPARSEMEM_VMEMMAP
965 static int __map_kernel_page_nid(unsigned long ea, unsigned long pa,
966                                  pgprot_t flags, unsigned int map_page_size,
967                                  int nid)
968 {
969         return __map_kernel_page(ea, pa, flags, map_page_size, nid, 0, 0);
970 }
971 
972 int __meminit radix__vmemmap_create_mapping(unsigned long start,
973                                       unsigned long page_size,
974                                       unsigned long phys)
975 {
976         /* Create a PTE encoding */
977         int nid = early_pfn_to_nid(phys >> PAGE_SHIFT);
978         int ret;
979 
980         if ((start + page_size) >= RADIX_VMEMMAP_END) {
981                 pr_warn("Outside the supported range\n");
982                 return -1;
983         }
984 
985         ret = __map_kernel_page_nid(start, phys, PAGE_KERNEL, page_size, nid);
986         BUG_ON(ret);
987 
988         return 0;
989 }
990 
991 
992 bool vmemmap_can_optimize(struct vmem_altmap *altmap, struct dev_pagemap *pgmap)
993 {
994         if (radix_enabled())
995                 return __vmemmap_can_optimize(altmap, pgmap);
996 
997         return false;
998 }
999 
1000 int __meminit vmemmap_check_pmd(pmd_t *pmdp, int node,
1001                                 unsigned long addr, unsigned long next)
1002 {
1003         int large = pmd_leaf(*pmdp);
1004 
1005         if (large)
1006                 vmemmap_verify(pmdp_ptep(pmdp), node, addr, next);
1007 
1008         return large;
1009 }
1010 
1011 void __meminit vmemmap_set_pmd(pmd_t *pmdp, void *p, int node,
1012                                unsigned long addr, unsigned long next)
1013 {
1014         pte_t entry;
1015         pte_t *ptep = pmdp_ptep(pmdp);
1016 
1017         VM_BUG_ON(!IS_ALIGNED(addr, PMD_SIZE));
1018         entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1019         set_pte_at(&init_mm, addr, ptep, entry);
1020         asm volatile("ptesync": : :"memory");
1021 
1022         vmemmap_verify(ptep, node, addr, next);
1023 }
1024 
1025 static pte_t * __meminit radix__vmemmap_pte_populate(pmd_t *pmdp, unsigned long addr,
1026                                                      int node,
1027                                                      struct vmem_altmap *altmap,
1028                                                      struct page *reuse)
1029 {
1030         pte_t *pte = pte_offset_kernel(pmdp, addr);
1031 
1032         if (pte_none(*pte)) {
1033                 pte_t entry;
1034                 void *p;
1035 
1036                 if (!reuse) {
1037                         /*
1038                          * make sure we don't create altmap mappings
1039                          * covering things outside the device.
1040                          */
1041                         if (altmap && altmap_cross_boundary(altmap, addr, PAGE_SIZE))
1042                                 altmap = NULL;
1043 
1044                         p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
1045                         if (!p && altmap)
1046                                 p = vmemmap_alloc_block_buf(PAGE_SIZE, node, NULL);
1047                         if (!p)
1048                                 return NULL;
1049                         pr_debug("PAGE_SIZE vmemmap mapping\n");
1050                 } else {
1051                         /*
1052                          * When a PTE/PMD entry is freed from the init_mm
1053                          * there's a free_pages() call to this page allocated
1054                          * above. Thus this get_page() is paired with the
1055                          * put_page_testzero() on the freeing path.
1056                          * This can only be called by certain ZONE_DEVICE paths,
1057                          * and through vmemmap_populate_compound_pages() when
1058                          * slab is available.
1059                          */
1060                         get_page(reuse);
1061                         p = page_to_virt(reuse);
1062                         pr_debug("Tail page reuse vmemmap mapping\n");
1063                 }
1064 
1065                 VM_BUG_ON(!PAGE_ALIGNED(addr));
1066                 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
1067                 set_pte_at(&init_mm, addr, pte, entry);
1068                 asm volatile("ptesync": : :"memory");
1069         }
1070         return pte;
1071 }
1072 
1073 static inline pud_t *vmemmap_pud_alloc(p4d_t *p4dp, int node,
1074                                        unsigned long address)
1075 {
1076         pud_t *pud;
1077 
1078         /* To keep early vmemmap mappings simple, do them all at PAGE_SIZE */
1079         if (unlikely(p4d_none(*p4dp))) {
1080                 if (unlikely(!slab_is_available())) {
1081                         pud = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1082                         p4d_populate(&init_mm, p4dp, pud);
1083                         /* go to the pud_offset */
1084                 } else
1085                         return pud_alloc(&init_mm, p4dp, address);
1086         }
1087         return pud_offset(p4dp, address);
1088 }
1089 
1090 static inline pmd_t *vmemmap_pmd_alloc(pud_t *pudp, int node,
1091                                        unsigned long address)
1092 {
1093         pmd_t *pmd;
1094 
1095         /* To keep early vmemmap mappings simple, do them all at PAGE_SIZE */
1096         if (unlikely(pud_none(*pudp))) {
1097                 if (unlikely(!slab_is_available())) {
1098                         pmd = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1099                         pud_populate(&init_mm, pudp, pmd);
1100                 } else
1101                         return pmd_alloc(&init_mm, pudp, address);
1102         }
1103         return pmd_offset(pudp, address);
1104 }
1105 
1106 static inline pte_t *vmemmap_pte_alloc(pmd_t *pmdp, int node,
1107                                        unsigned long address)
1108 {
1109         pte_t *pte;
1110 
1111         /* To keep early vmemmap mappings simple, do them all at PAGE_SIZE */
1112         if (unlikely(pmd_none(*pmdp))) {
1113                 if (unlikely(!slab_is_available())) {
1114                         pte = early_alloc_pgtable(PAGE_SIZE, node, 0, 0);
1115                         pmd_populate(&init_mm, pmdp, pte);
1116                 } else
1117                         return pte_alloc_kernel(pmdp, address);
1118         }
1119         return pte_offset_kernel(pmdp, address);
1120 }
1121 
1122 
1123 
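/*
 * Populate the vmemmap for [start, end). When a PMD slot is empty, try to
 * back it with a 2M block (going straight to base pages if an altmap is
 * used and the address is not 2M aligned or the block would cross the
 * device boundary); fall back to base page size mappings when the 2M
 * allocation fails or a PTE table already exists.
 */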
1124 int __meminit radix__vmemmap_populate(unsigned long start, unsigned long end, int node,
1125                                       struct vmem_altmap *altmap)
1126 {
1127         unsigned long addr;
1128         unsigned long next;
1129         pgd_t *pgd;
1130         p4d_t *p4d;
1131         pud_t *pud;
1132         pmd_t *pmd;
1133         pte_t *pte;
1134 
1135         for (addr = start; addr < end; addr = next) {
1136                 next = pmd_addr_end(addr, end);
1137 
1138                 pgd = pgd_offset_k(addr);
1139                 p4d = p4d_offset(pgd, addr);
1140                 pud = vmemmap_pud_alloc(p4d, node, addr);
1141                 if (!pud)
1142                         return -ENOMEM;
1143                 pmd = vmemmap_pmd_alloc(pud, node, addr);
1144                 if (!pmd)
1145                         return -ENOMEM;
1146 
1147                 if (pmd_none(READ_ONCE(*pmd))) {
1148                         void *p;
1149 
1150                         /*
1151                          * Keep it simple by checking addr PMD_SIZE alignment
1152                          * and verifying the device boundary condition.
1153                          * For us to use a pmd mapping, both addr and pfn should
1154                          * be aligned. We skip if addr is not aligned and for
1155                          * pfn we hope we have extra area in the altmap that
1156                          * can help to find an aligned block. This can result
1157                          * in altmap block allocation failures, in which case
1158                          * we fall back to RAM for vmemmap allocation.
1159                          */
1160                         if (altmap && (!IS_ALIGNED(addr, PMD_SIZE) ||
1161                                        altmap_cross_boundary(altmap, addr, PMD_SIZE))) {
1162                                 /*
1163                                  * make sure we don't create altmap mappings
1164                                  * covering things outside the device.
1165                                  */
1166                                 goto base_mapping;
1167                         }
1168 
1169                         p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap);
1170                         if (p) {
1171                                 vmemmap_set_pmd(pmd, p, node, addr, next);
1172                                 pr_debug("PMD_SIZE vmemmap mapping\n");
1173                                 continue;
1174                         } else if (altmap) {
1175                                 /*
1176                                  * A vmemmap block allocation can fail due to
1177                                  * alignment requirements when we try to align
1178                                  * things aggressively, thereby running out of
1179                                  * space. Try base mapping on failure.
1180                                  */
1181                                 goto base_mapping;
1182                         }
1183                 } else if (vmemmap_check_pmd(pmd, node, addr, next)) {
1184                         /*
1185                          * If a huge mapping exists due to an early call to
1186                          * vmemmap_populate, let's try to use that.
1187                          */
1188                         continue;
1189                 }
1190 base_mapping:
1191                 /*
1192                  * Not able to allocate higher-order memory to back the memmap,
1193                  * or we found a pointer to a pte page. Allocate base page
1194                  * size vmemmap.
1195                  */
1196                 pte = vmemmap_pte_alloc(pmd, node, addr);
1197                 if (!pte)
1198                         return -ENOMEM;
1199 
1200                 pte = radix__vmemmap_pte_populate(pmd, addr, node, altmap, NULL);
1201                 if (!pte)
1202                         return -ENOMEM;
1203 
1204                 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1205                 next = addr + PAGE_SIZE;
1206         }
1207         return 0;
1208 }
1209 
1210 static pte_t * __meminit radix__vmemmap_populate_address(unsigned long addr, int node,
1211                                                          struct vmem_altmap *altmap,
1212                                                          struct page *reuse)
1213 {
1214         pgd_t *pgd;
1215         p4d_t *p4d;
1216         pud_t *pud;
1217         pmd_t *pmd;
1218         pte_t *pte;
1219 
1220         pgd = pgd_offset_k(addr);
1221         p4d = p4d_offset(pgd, addr);
1222         pud = vmemmap_pud_alloc(p4d, node, addr);
1223         if (!pud)
1224                 return NULL;
1225         pmd = vmemmap_pmd_alloc(pud, node, addr);
1226         if (!pmd)
1227                 return NULL;
1228         if (pmd_leaf(*pmd))
1229                 /*
1230                  * The second page is mapped as a hugepage due to a nearby request.
1231                  * Force our mapping to page size without deduplication
1232                  */
1233                 return NULL;
1234         pte = vmemmap_pte_alloc(pmd, node, addr);
1235         if (!pte)
1236                 return NULL;
1237         radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1238         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1239 
1240         return pte;
1241 }
1242 
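/*
 * Find (or create) the PTE for the second vmemmap page of a compound
 * device page, i.e. the page that later tail-page mappings reuse.
 * map_addr is derived by walking back pfn_offset struct pages from addr
 * and stepping one PAGE_SIZE forward.
 */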
1243 static pte_t * __meminit vmemmap_compound_tail_page(unsigned long addr,
1244                                                     unsigned long pfn_offset, int node)
1245 {
1246         pgd_t *pgd;
1247         p4d_t *p4d;
1248         pud_t *pud;
1249         pmd_t *pmd;
1250         pte_t *pte;
1251         unsigned long map_addr;
1252 
1253         /* the second vmemmap page which we use for duplication */
1254         map_addr = addr - pfn_offset * sizeof(struct page) + PAGE_SIZE;
1255         pgd = pgd_offset_k(map_addr);
1256         p4d = p4d_offset(pgd, map_addr);
1257         pud = vmemmap_pud_alloc(p4d, node, map_addr);
1258         if (!pud)
1259                 return NULL;
1260         pmd = vmemmap_pmd_alloc(pud, node, map_addr);
1261         if (!pmd)
1262                 return NULL;
1263         if (pmd_leaf(*pmd))
1264                 /*
1265                  * The second page is mapped as a hugepage due to a nearby request.
1266                  * Force our mapping to page size without deduplication
1267                  */
1268                 return NULL;
1269         pte = vmemmap_pte_alloc(pmd, node, map_addr);
1270         if (!pte)
1271                 return NULL;
1272         /*
1273          * Check if there exists a mapping to the left
1274          */
1275         if (pte_none(*pte)) {
1276                 /*
1277                  * Populate the head page vmemmap page.
1278                  * It can fall in different pmd, hence
1279                  * vmemmap_populate_address()
1280                  */
1281                 pte = radix__vmemmap_populate_address(map_addr - PAGE_SIZE, node, NULL, NULL);
1282                 if (!pte)
1283                         return NULL;
1284                 /*
1285                  * Populate the tail pages vmemmap page
1286                  */
1287                 pte = radix__vmemmap_pte_populate(pmd, map_addr, node, NULL, NULL);
1288                 if (!pte)
1289                         return NULL;
1290                 vmemmap_verify(pte, node, map_addr, map_addr + PAGE_SIZE);
1291                 return pte;
1292         }
1293         return pte;
1294 }
1295 
1296 int __meminit vmemmap_populate_compound_pages(unsigned long start_pfn,
1297                                               unsigned long start,
1298                                               unsigned long end, int node,
1299                                               struct dev_pagemap *pgmap)
1300 {
1301         /*
1302          * We want to map things with base page size mappings so that
1303          * we can save space in vmemmap. We could have a huge mapping
1304          * covering both edges.
1305          */
1306         unsigned long addr;
1307         unsigned long addr_pfn = start_pfn;
1308         unsigned long next;
1309         pgd_t *pgd;
1310         p4d_t *p4d;
1311         pud_t *pud;
1312         pmd_t *pmd;
1313         pte_t *pte;
1314 
1315         for (addr = start; addr < end; addr = next) {
1316 
1317                 pgd = pgd_offset_k(addr);
1318                 p4d = p4d_offset(pgd, addr);
1319                 pud = vmemmap_pud_alloc(p4d, node, addr);
1320                 if (!pud)
1321                         return -ENOMEM;
1322                 pmd = vmemmap_pmd_alloc(pud, node, addr);
1323                 if (!pmd)
1324                         return -ENOMEM;
1325 
1326                 if (pmd_leaf(READ_ONCE(*pmd))) {
1327                         /* existing huge mapping. Skip the range */
1328                         addr_pfn += (PMD_SIZE >> PAGE_SHIFT);
1329                         next = pmd_addr_end(addr, end);
1330                         continue;
1331                 }
1332                 pte = vmemmap_pte_alloc(pmd, node, addr);
1333                 if (!pte)
1334                         return -ENOMEM;
1335                 if (!pte_none(*pte)) {
1336                         /*
1337                          * This could be because we already have a compound
1338                          * page whose VMEMMAP_RESERVE_NR pages were mapped and
1339                          * this request falls within those pages.
1340                          */
1341                         addr_pfn += 1;
1342                         next = addr + PAGE_SIZE;
1343                         continue;
1344                 } else {
1345                         unsigned long nr_pages = pgmap_vmemmap_nr(pgmap);
1346                         unsigned long pfn_offset = addr_pfn - ALIGN_DOWN(addr_pfn, nr_pages);
1347                         pte_t *tail_page_pte;
1348 
1349                         /*
1350                          * If the address is aligned to the huge page size it is the
1351                          * head mapping.
1352                          */
1353                         if (pfn_offset == 0) {
1354                                 /* Populate the head page vmemmap page */
1355                                 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1356                                 if (!pte)
1357                                         return -ENOMEM;
1358                                 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1359 
1360                                 /*
1361                                  * Populate the tail pages vmemmap page
1362                                  * It can fall in different pmd, hence
1363                                  * vmemmap_populate_address()
1364                                  */
1365                                 pte = radix__vmemmap_populate_address(addr + PAGE_SIZE, node, NULL, NULL);
1366                                 if (!pte)
1367                                         return -ENOMEM;
1368 
1369                                 addr_pfn += 2;
1370                                 next = addr + 2 * PAGE_SIZE;
1371                                 continue;
1372                         }
1373                         /*
1374                          * Get the 2nd mapping's details.
1375                          * Also create it if it doesn't exist.
1376                          */
1377                         tail_page_pte = vmemmap_compound_tail_page(addr, pfn_offset, node);
1378                         if (!tail_page_pte) {
1379 
1380                                 pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, NULL);
1381                                 if (!pte)
1382                                         return -ENOMEM;
1383                                 vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1384 
1385                                 addr_pfn += 1;
1386                                 next = addr + PAGE_SIZE;
1387                                 continue;
1388                         }
1389 
1390                         pte = radix__vmemmap_pte_populate(pmd, addr, node, NULL, pte_page(*tail_page_pte));
1391                         if (!pte)
1392                                 return -ENOMEM;
1393                         vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
1394 
1395                         addr_pfn += 1;
1396                         next = addr + PAGE_SIZE;
1397                         continue;
1398                 }
1399         }
1400         return 0;
1401 }
1402 
1403 
1404 #ifdef CONFIG_MEMORY_HOTPLUG
1405 void __meminit radix__vmemmap_remove_mapping(unsigned long start, unsigned long page_size)
1406 {
1407         remove_pagetable(start, start + page_size, true, NULL);
1408 }
1409 
1410 void __ref radix__vmemmap_free(unsigned long start, unsigned long end,
1411                                struct vmem_altmap *altmap)
1412 {
1413         remove_pagetable(start, end, false, altmap);
1414 }
1415 #endif
1416 #endif
1417 
1418 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
1419 
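/*
 * Clear/set bits in a huge PMD entry. With DEBUG_VM this asserts that the
 * entry really is a transparent huge page (or devmap) and that the PMD
 * lock is held. The previous value is returned and a hugepage_update
 * trace event is emitted; the last argument to radix__pte_update() flags
 * the update as operating on a huge mapping.
 */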
1420 unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr,
1421                                   pmd_t *pmdp, unsigned long clr,
1422                                   unsigned long set)
1423 {
1424         unsigned long old;
1425 
1426 #ifdef CONFIG_DEBUG_VM
1427         WARN_ON(!radix__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp));
1428         assert_spin_locked(pmd_lockptr(mm, pmdp));
1429 #endif
1430 
1431         old = radix__pte_update(mm, addr, pmdp_ptep(pmdp), clr, set, 1);
1432         trace_hugepage_update_pmd(addr, old, clr, set);
1433 
1434         return old;
1435 }
1436 
1437 unsigned long radix__pud_hugepage_update(struct mm_struct *mm, unsigned long addr,
1438                                          pud_t *pudp, unsigned long clr,
1439                                          unsigned long set)
1440 {
1441         unsigned long old;
1442 
1443 #ifdef CONFIG_DEBUG_VM
1444         WARN_ON(!pud_devmap(*pudp));
1445         assert_spin_locked(pud_lockptr(mm, pudp));
1446 #endif
1447 
1448         old = radix__pte_update(mm, addr, pudp_ptep(pudp), clr, set, 1);
1449         trace_hugepage_update_pud(addr, old, clr, set);
1450 
1451         return old;
1452 }
1453 
1454 pmd_t radix__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
1455                         pmd_t *pmdp)
1456 
1457 {
1458         pmd_t pmd;
1459 
1460         VM_BUG_ON(address & ~HPAGE_PMD_MASK);
1461         VM_BUG_ON(radix__pmd_trans_huge(*pmdp));
1462         VM_BUG_ON(pmd_devmap(*pmdp));
1463         /*
1464          * khugepaged calls this for normal pmd
1465          */
1466         pmd = *pmdp;
1467         pmd_clear(pmdp);
1468 
1469         radix__flush_tlb_collapsed_pmd(vma->vm_mm, address);
1470 
1471         return pmd;
1472 }
1473 
1474 /*
1475  * For us pgtable_t is pte_t *. In order to save the deposited
1476  * page table, we consider the allocated page table as a list
1477  * head. On withdraw we need to make sure we zero out the used
1478  * list_head memory area.
1479  */
1480 void radix__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
1481                                  pgtable_t pgtable)
1482 {
1483         struct list_head *lh = (struct list_head *) pgtable;
1484 
1485         assert_spin_locked(pmd_lockptr(mm, pmdp));
1486 
1487         /* FIFO */
1488         if (!pmd_huge_pte(mm, pmdp))
1489                 INIT_LIST_HEAD(lh);
1490         else
1491                 list_add(lh, (struct list_head *) pmd_huge_pte(mm, pmdp));
1492         pmd_huge_pte(mm, pmdp) = pgtable;
1493 }
1494 
1495 pgtable_t radix__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
1496 {
1497         pte_t *ptep;
1498         pgtable_t pgtable;
1499         struct list_head *lh;
1500 
1501         assert_spin_locked(pmd_lockptr(mm, pmdp));
1502 
1503         /* FIFO */
1504         pgtable = pmd_huge_pte(mm, pmdp);
1505         lh = (struct list_head *) pgtable;
1506         if (list_empty(lh))
1507                 pmd_huge_pte(mm, pmdp) = NULL;
1508         else {
1509                 pmd_huge_pte(mm, pmdp) = (pgtable_t) lh->next;
1510                 list_del(lh);
1511         }
1512         ptep = (pte_t *) pgtable;
1513         *ptep = __pte(0);
1514         ptep++;
1515         *ptep = __pte(0);
1516         return pgtable;
1517 }
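
/*
 * A standalone userspace sketch (not kernel code) of the list_head trick
 * described in the comment above radix__pgtable_trans_huge_deposit():
 * while a page table page sits on the deposit list, its first two 8-byte
 * PTE slots double as a struct list_head, and the withdraw path zeroes
 * exactly those two slots before handing the page back. The types and
 * names below are invented for this sketch only.
 */
#include <stdint.h>
#include <stdio.h>

#define PTES_PER_PAGE 512

struct fake_list_head {
	struct fake_list_head *next, *prev;
};

int main(void)
{
	/* a "page table page": 512 eight-byte PTE slots, all clear */
	static uint64_t pgtable[PTES_PER_PAGE];

	/* deposit: overlay an (empty) list head on the first two slots */
	struct fake_list_head *lh = (struct fake_list_head *)pgtable;
	lh->next = lh;
	lh->prev = lh;

	printf("deposited: slot0=%#llx slot1=%#llx (list pointers)\n",
	       (unsigned long long)pgtable[0], (unsigned long long)pgtable[1]);

	/* withdraw: zero the two slots that held the list head */
	pgtable[0] = 0;
	pgtable[1] = 0;

	printf("withdrawn: slot0=%#llx slot1=%#llx (clean PTEs again)\n",
	       (unsigned long long)pgtable[0], (unsigned long long)pgtable[1]);
	return 0;
}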
1518 
1519 pmd_t radix__pmdp_huge_get_and_clear(struct mm_struct *mm,
1520                                      unsigned long addr, pmd_t *pmdp)
1521 {
1522         pmd_t old_pmd;
1523         unsigned long old;
1524 
1525         old = radix__pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0);
1526         old_pmd = __pmd(old);
1527         return old_pmd;
1528 }
1529 
1530 pud_t radix__pudp_huge_get_and_clear(struct mm_struct *mm,
1531                                      unsigned long addr, pud_t *pudp)
1532 {
1533         pud_t old_pud;
1534         unsigned long old;
1535 
1536         old = radix__pud_hugepage_update(mm, addr, pudp, ~0UL, 0);
1537         old_pud = __pud(old);
1538         return old_pud;
1539 }
1540 
1541 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
1542 
1543 void radix__ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep,
1544                                   pte_t entry, unsigned long address, int psize)
1545 {
1546         struct mm_struct *mm = vma->vm_mm;
1547         unsigned long set = pte_val(entry) & (_PAGE_DIRTY | _PAGE_SOFT_DIRTY |
1548                                               _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC);
1549 
1550         unsigned long change = pte_val(entry) ^ pte_val(*ptep);
1551         /*
1552          * On POWER9, the NMMU is not able to relax PTE access permissions
1553          * for a translation cached in its TLB. The PTE must be invalidated
1554          * and the TLB flushed before the new PTE is installed.
1555          *
1556          * This only needs to be done for radix, because hash translation does
1557          * flush when updating the linux pte (and we don't support NMMU
1558          * accelerators on HPT on POWER9 anyway XXX: do we?).
1559          *
1560          * POWER10 (and P9P) NMMU does behave as per ISA.
1561          */
1562         if (!cpu_has_feature(CPU_FTR_ARCH_31) && (change & _PAGE_RW) &&
1563             atomic_read(&mm->context.copros) > 0) {
1564                 unsigned long old_pte, new_pte;
1565 
1566                 old_pte = __radix_pte_update(ptep, _PAGE_PRESENT, _PAGE_INVALID);
1567                 new_pte = old_pte | set;
1568                 radix__flush_tlb_page_psize(mm, address, psize);
1569                 __radix_pte_update(ptep, _PAGE_INVALID, new_pte);
1570         } else {
1571                 __radix_pte_update(ptep, 0, set);
1572                 /*
1573                  * Book3S does not require a TLB flush when relaxing access
1574                  * restrictions in an address space (modulo the POWER9 nest
1575                  * MMU issue above) because the MMU will reload the PTE after
1576                  * taking an access fault, as defined by the architecture. See
1577                  * "Setting a Reference or Change Bit or Upgrading Access
1578                  *  Authority (PTE Subject to Atomic Hardware Updates)" in
1579                  *  Power ISA Version 3.1B.
1580                  */
1581         }
1582         /* See ptesync comment in radix__set_pte_at */
1583 }
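
/*
 * A standalone userspace sketch (not kernel code) of the two update paths
 * in radix__ptep_set_access_flags() above. Only the ordering mirrors the
 * kernel: with a nest MMU attached, invalidate the PTE, flush, then install
 * the relaxed PTE; otherwise just OR in the new bits. The bit values and
 * fake_tlb_flush() are invented for this sketch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define F_PRESENT	0x1ULL
#define F_INVALID	0x2ULL
#define F_RW		0x4ULL
#define F_ACCESSED	0x8ULL

static void fake_tlb_flush(void)
{
	printf("  flush: stale translation dropped\n");
}

static void set_access_flags(uint64_t *pte, uint64_t set, bool nmmu_attached)
{
	if (nmmu_attached && (set & F_RW)) {
		/* nest-MMU path: invalidate first, flush, then relax */
		*pte = (*pte & ~F_PRESENT) | F_INVALID;
		fake_tlb_flush();
		*pte = (*pte & ~F_INVALID) | F_PRESENT | set;
	} else {
		/* normal path: just OR in the new permission/reference bits */
		*pte |= set;
	}
}

int main(void)
{
	uint64_t pte = F_PRESENT | F_ACCESSED;

	printf("nest-MMU path:\n");
	set_access_flags(&pte, F_RW, true);
	printf("  final pte = %#llx\n", (unsigned long long)pte);

	pte = F_PRESENT | F_ACCESSED;
	printf("normal path:\n");
	set_access_flags(&pte, F_RW, false);
	printf("  final pte = %#llx\n", (unsigned long long)pte);
	return 0;
}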
1584 
1585 void radix__ptep_modify_prot_commit(struct vm_area_struct *vma,
1586                                     unsigned long addr, pte_t *ptep,
1587                                     pte_t old_pte, pte_t pte)
1588 {
1589         struct mm_struct *mm = vma->vm_mm;
1590 
1591         /*
1592          * POWER9 NMMU must flush the TLB after clearing the PTE before
1593          * installing a PTE with more relaxed access permissions, see
1594          * radix__ptep_set_access_flags.
1595          */
1596         if (!cpu_has_feature(CPU_FTR_ARCH_31) &&
1597             is_pte_rw_upgrade(pte_val(old_pte), pte_val(pte)) &&
1598             (atomic_read(&mm->context.copros) > 0))
1599                 radix__flush_tlb_page(vma, addr);
1600 
1601         set_pte_at(mm, addr, ptep, pte);
1602 }
1603 
1604 int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot)
1605 {
1606         pte_t *ptep = (pte_t *)pud;
1607         pte_t new_pud = pfn_pte(__phys_to_pfn(addr), prot);
1608 
1609         if (!radix_enabled())
1610                 return 0;
1611 
1612         set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pud);
1613 
1614         return 1;
1615 }
1616 
1617 int pud_clear_huge(pud_t *pud)
1618 {
1619         if (pud_leaf(*pud)) {
1620                 pud_clear(pud);
1621                 return 1;
1622         }
1623 
1624         return 0;
1625 }
1626 
1627 int pud_free_pmd_page(pud_t *pud, unsigned long addr)
1628 {
1629         pmd_t *pmd;
1630         int i;
1631 
1632         pmd = pud_pgtable(*pud);
1633         pud_clear(pud);
1634 
1635         flush_tlb_kernel_range(addr, addr + PUD_SIZE);
1636 
1637         for (i = 0; i < PTRS_PER_PMD; i++) {
1638                 if (!pmd_none(pmd[i])) {
1639                         pte_t *pte;
1640                         pte = (pte_t *)pmd_page_vaddr(pmd[i]);
1641 
1642                         pte_free_kernel(&init_mm, pte);
1643                 }
1644         }
1645 
1646         pmd_free(&init_mm, pmd);
1647 
1648         return 1;
1649 }
1650 
1651 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot)
1652 {
1653         pte_t *ptep = (pte_t *)pmd;
1654         pte_t new_pmd = pfn_pte(__phys_to_pfn(addr), prot);
1655 
1656         if (!radix_enabled())
1657                 return 0;
1658 
1659         set_pte_at(&init_mm, 0 /* radix unused */, ptep, new_pmd);
1660 
1661         return 1;
1662 }
1663 
1664 int pmd_clear_huge(pmd_t *pmd)
1665 {
1666         if (pmd_leaf(*pmd)) {
1667                 pmd_clear(pmd);
1668                 return 1;
1669         }
1670 
1671         return 0;
1672 }
1673 
1674 int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
1675 {
1676         pte_t *pte;
1677 
1678         pte = (pte_t *)pmd_page_vaddr(*pmd);
1679         pmd_clear(pmd);
1680 
1681         flush_tlb_kernel_range(addr, addr + PMD_SIZE);
1682 
1683         pte_free_kernel(&init_mm, pte);
1684 
1685         return 1;
1686 }
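
/*
 * A standalone userspace sketch (not kernel code) of the contract that
 * pmd_set_huge()/pmd_free_pte_page() above appear to give to generic
 * huge-vmap code: a PMD slot is either a pointer to a lower-level PTE table
 * or a huge leaf, and before installing a leaf the old PTE table is
 * unhooked, the range flushed, and the table freed. All structures and
 * names are invented for this sketch.
 */
#include <stdio.h>
#include <stdlib.h>

#define PTES_PER_TABLE 512

struct pmd_slot {
	int is_leaf;			/* huge mapping installed directly here */
	unsigned long leaf_phys;	/* physical address when is_leaf */
	unsigned long *pte_table;	/* lower-level table otherwise */
};

/* analogue of pmd_free_pte_page(): unhook, flush, then free the lower table */
static void free_pte_table(struct pmd_slot *pmd)
{
	unsigned long *old = pmd->pte_table;

	pmd->pte_table = NULL;				/* pmd_clear() */
	printf("flush_tlb_kernel_range() equivalent\n");
	free(old);					/* pte_free_kernel() equivalent */
}

/* analogue of pmd_set_huge(): install a leaf mapping in the slot */
static void set_huge(struct pmd_slot *pmd, unsigned long phys)
{
	pmd->is_leaf = 1;
	pmd->leaf_phys = phys;
}

int main(void)
{
	struct pmd_slot pmd = {
		.pte_table = calloc(PTES_PER_TABLE, sizeof(unsigned long)),
	};

	/* generic vmap code: drop the small-page table, then map huge */
	free_pte_table(&pmd);
	set_huge(&pmd, 0x200000UL);

	printf("pmd is %s, maps %#lx\n",
	       pmd.is_leaf ? "a huge leaf" : "a table pointer", pmd.leaf_phys);
	return 0;
}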
1687 
