1 // SPDX-License-Identifier: GPL-2.0-only << 2 /* 1 /* 3 * kexec.c - kexec_load system call !! 2 * kexec.c - kexec system call 4 * Copyright (C) 2002-2004 Eric Biederman <eb 3 * Copyright (C) 2002-2004 Eric Biederman <ebiederm@xmission.com> >> 4 * >> 5 * This source code is licensed under the GNU General Public License, >> 6 * Version 2. See the file COPYING for more details. 5 */ 7 */ 6 8 7 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt << 8 << 9 #include <linux/capability.h> 9 #include <linux/capability.h> 10 #include <linux/mm.h> 10 #include <linux/mm.h> 11 #include <linux/file.h> 11 #include <linux/file.h> 12 #include <linux/security.h> !! 12 #include <linux/slab.h> >> 13 #include <linux/fs.h> 13 #include <linux/kexec.h> 14 #include <linux/kexec.h> 14 #include <linux/mutex.h> 15 #include <linux/mutex.h> 15 #include <linux/list.h> 16 #include <linux/list.h> >> 17 #include <linux/highmem.h> 16 #include <linux/syscalls.h> 18 #include <linux/syscalls.h> >> 19 #include <linux/reboot.h> >> 20 #include <linux/ioport.h> >> 21 #include <linux/hardirq.h> >> 22 #include <linux/elf.h> >> 23 #include <linux/elfcore.h> >> 24 #include <linux/utsname.h> >> 25 #include <linux/numa.h> >> 26 #include <linux/suspend.h> >> 27 #include <linux/device.h> >> 28 #include <linux/freezer.h> >> 29 #include <linux/pm.h> >> 30 #include <linux/cpu.h> >> 31 #include <linux/console.h> 17 #include <linux/vmalloc.h> 32 #include <linux/vmalloc.h> 18 #include <linux/slab.h> !! 33 #include <linux/swap.h> >> 34 #include <linux/syscore_ops.h> >> 35 >> 36 #include <asm/page.h> >> 37 #include <asm/uaccess.h> >> 38 #include <asm/io.h> >> 39 #include <asm/sections.h> 19 #include <linux/ccsecurity.h> 40 #include <linux/ccsecurity.h> 20 #include "kexec_internal.h" << 21 41 22 static int kimage_alloc_init(struct kimage **r !! 42 /* Per cpu memory for storing cpu states in case of system crash. */ 23 unsigned long nr_ !! 43 note_buf_t __percpu *crash_notes; 24 struct kexec_segm !! 44 25 unsigned long fla !! 45 /* vmcoreinfo stuff */ >> 46 static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; >> 47 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; >> 48 size_t vmcoreinfo_size; >> 49 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); >> 50 >> 51 /* Flag to indicate we are going to kexec a new kernel */ >> 52 bool kexec_in_progress = false; >> 53 >> 54 /* Location of the reserved area for the crash kernel */ >> 55 struct resource crashk_res = { >> 56 .name = "Crash kernel", >> 57 .start = 0, >> 58 .end = 0, >> 59 .flags = IORESOURCE_BUSY | IORESOURCE_MEM >> 60 }; >> 61 struct resource crashk_low_res = { >> 62 .name = "Crash kernel", >> 63 .start = 0, >> 64 .end = 0, >> 65 .flags = IORESOURCE_BUSY | IORESOURCE_MEM >> 66 }; >> 67 >> 68 int kexec_should_crash(struct task_struct *p) 26 { 69 { 27 int ret; !! 70 if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) 28 struct kimage *image; !! 71 return 1; 29 bool kexec_on_panic = flags & KEXEC_ON !! 72 return 0; >> 73 } 30 74 31 #ifdef CONFIG_CRASH_DUMP !! 75 /* 32 if (kexec_on_panic) { !! 76 * When kexec transitions to the new kernel there is a one-to-one 33 /* Verify we have a valid entr !! 77 * mapping between physical and virtual addresses. On processors 34 if ((entry < phys_to_boot_phys !! 78 * where you can disable the MMU this is trivial, and easy. For 35 (entry > phys_to_boot_phys !! 79 * others it is still a simple predictable page table to setup. 36 return -EADDRNOTAVAIL; !! 80 * 37 } !! 81 * In that environment kexec copies the new kernel to its final 38 #endif !! 
82 * resting place. This means I can only support memory whose >> 83 * physical address can fit in an unsigned long. In particular >> 84 * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled. >> 85 * If the assembly stub has more restrictive requirements >> 86 * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be >> 87 * defined more restrictively in <asm/kexec.h>. >> 88 * >> 89 * The code for the transition from the current kernel to the >> 90 * the new kernel is placed in the control_code_buffer, whose size >> 91 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single >> 92 * page of memory is necessary, but some architectures require more. >> 93 * Because this memory must be identity mapped in the transition from >> 94 * virtual to physical addresses it must live in the range >> 95 * 0 - TASK_SIZE, as only the user space mappings are arbitrarily >> 96 * modifiable. >> 97 * >> 98 * The assembly stub in the control code buffer is passed a linked list >> 99 * of descriptor pages detailing the source pages of the new kernel, >> 100 * and the destination addresses of those source pages. As this data >> 101 * structure is not used in the context of the current OS, it must >> 102 * be self-contained. >> 103 * >> 104 * The code has been made to work with highmem pages and will use a >> 105 * destination page in its final resting place (if it happens >> 106 * to allocate it). The end product of this is that most of the >> 107 * physical address space, and most of RAM can be used. >> 108 * >> 109 * Future directions include: >> 110 * - allocating a page table with the control code buffer identity >> 111 * mapped, to simplify machine_kexec and make kexec_on_panic more >> 112 * reliable. >> 113 */ 39 114 40 /* Allocate and initialize a controlli !! 115 /* 41 image = do_kimage_alloc_init(); !! 116 * KIMAGE_NO_DEST is an impossible destination address..., for >> 117 * allocating pages whose destination address we do not care about. >> 118 */ >> 119 #define KIMAGE_NO_DEST (-1UL) >> 120 >> 121 static int kimage_is_destination_range(struct kimage *image, >> 122 unsigned long start, unsigned long end); >> 123 static struct page *kimage_alloc_page(struct kimage *image, >> 124 gfp_t gfp_mask, >> 125 unsigned long dest); >> 126 >> 127 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, >> 128 unsigned long nr_segments, >> 129 struct kexec_segment __user *segments) >> 130 { >> 131 size_t segment_bytes; >> 132 struct kimage *image; >> 133 unsigned long i; >> 134 int result; >> 135 >> 136 /* Allocate a controlling structure */ >> 137 result = -ENOMEM; >> 138 image = kzalloc(sizeof(*image), GFP_KERNEL); 42 if (!image) 139 if (!image) 43 return -ENOMEM; !! 140 goto out; 44 141 >> 142 image->head = 0; >> 143 image->entry = &image->head; >> 144 image->last_entry = &image->head; >> 145 image->control_page = ~0; /* By default this does not apply */ 45 image->start = entry; 146 image->start = entry; >> 147 image->type = KEXEC_TYPE_DEFAULT; >> 148 >> 149 /* Initialize the list of control pages */ >> 150 INIT_LIST_HEAD(&image->control_pages); >> 151 >> 152 /* Initialize the list of destination pages */ >> 153 INIT_LIST_HEAD(&image->dest_pages); >> 154 >> 155 /* Initialize the list of unusable pages */ >> 156 INIT_LIST_HEAD(&image->unuseable_pages); >> 157 >> 158 /* Read in the segments */ 46 image->nr_segments = nr_segments; 159 image->nr_segments = nr_segments; 47 memcpy(image->segment, segments, nr_se !! 
160 segment_bytes = nr_segments * sizeof(*segments); >> 161 result = copy_from_user(image->segment, segments, segment_bytes); >> 162 if (result) { >> 163 result = -EFAULT; >> 164 goto out; >> 165 } >> 166 >> 167 /* >> 168 * Verify we have good destination addresses. The caller is >> 169 * responsible for making certain we don't attempt to load >> 170 * the new image into invalid or reserved areas of RAM. This >> 171 * just verifies it is an address we can use. >> 172 * >> 173 * Since the kernel does everything in page size chunks ensure >> 174 * the destination addresses are page aligned. Too many >> 175 * special cases crop of when we don't do this. The most >> 176 * insidious is getting overlapping destination addresses >> 177 * simply because addresses are changed to page size >> 178 * granularity. >> 179 */ >> 180 result = -EADDRNOTAVAIL; >> 181 for (i = 0; i < nr_segments; i++) { >> 182 unsigned long mstart, mend; >> 183 >> 184 mstart = image->segment[i].mem; >> 185 mend = mstart + image->segment[i].memsz; >> 186 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) >> 187 goto out; >> 188 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT) >> 189 goto out; >> 190 } >> 191 >> 192 /* Verify our destination addresses do not overlap. >> 193 * If we alloed overlapping destination addresses >> 194 * through very weird things can happen with no >> 195 * easy explanation as one segment stops on another. >> 196 */ >> 197 result = -EINVAL; >> 198 for (i = 0; i < nr_segments; i++) { >> 199 unsigned long mstart, mend; >> 200 unsigned long j; 48 201 49 #ifdef CONFIG_CRASH_DUMP !! 202 mstart = image->segment[i].mem; 50 if (kexec_on_panic) { !! 203 mend = mstart + image->segment[i].memsz; 51 /* Enable special crash kernel !! 204 for (j = 0; j < i; j++) { 52 image->control_page = crashk_r !! 205 unsigned long pstart, pend; 53 image->type = KEXEC_TYPE_CRASH !! 206 pstart = image->segment[j].mem; >> 207 pend = pstart + image->segment[j].memsz; >> 208 /* Do the segments overlap ? */ >> 209 if ((mend > pstart) && (mstart < pend)) >> 210 goto out; >> 211 } 54 } 212 } 55 #endif << 56 213 57 ret = sanity_check_segment_list(image) !! 214 /* Ensure our buffer sizes are strictly less than 58 if (ret) !! 215 * our memory sizes. This should always be the case, 59 goto out_free_image; !! 216 * and it is easier to check up front than to be surprised >> 217 * later on. >> 218 */ >> 219 result = -EINVAL; >> 220 for (i = 0; i < nr_segments; i++) { >> 221 if (image->segment[i].bufsz > image->segment[i].memsz) >> 222 goto out; >> 223 } >> 224 >> 225 result = 0; >> 226 out: >> 227 if (result == 0) >> 228 *rimage = image; >> 229 else >> 230 kfree(image); >> 231 >> 232 return result; >> 233 >> 234 } >> 235 >> 236 static void kimage_free_page_list(struct list_head *list); >> 237 >> 238 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, >> 239 unsigned long nr_segments, >> 240 struct kexec_segment __user *segments) >> 241 { >> 242 int result; >> 243 struct kimage *image; >> 244 >> 245 /* Allocate and initialize a controlling structure */ >> 246 image = NULL; >> 247 result = do_kimage_alloc(&image, entry, nr_segments, segments); >> 248 if (result) >> 249 goto out; 60 250 61 /* 251 /* 62 * Find a location for the control cod 252 * Find a location for the control code buffer, and add it 63 * the vector of segments so that it's 253 * the vector of segments so that it's pages will also be 64 * counted as destination pages. 254 * counted as destination pages. 65 */ 255 */ 66 ret = -ENOMEM; !! 
256 result = -ENOMEM; 67 image->control_code_page = kimage_allo 257 image->control_code_page = kimage_alloc_control_pages(image, 68 get 258 get_order(KEXEC_CONTROL_PAGE_SIZE)); 69 if (!image->control_code_page) { 259 if (!image->control_code_page) { 70 pr_err("Could not allocate con !! 260 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 71 goto out_free_image; !! 261 goto out_free; 72 } 262 } 73 263 74 if (!kexec_on_panic) { !! 264 image->swap_page = kimage_alloc_control_pages(image, 0); 75 image->swap_page = kimage_allo !! 265 if (!image->swap_page) { 76 if (!image->swap_page) { !! 266 printk(KERN_ERR "Could not allocate swap buffer\n"); 77 pr_err("Could not allo !! 267 goto out_free; 78 goto out_free_control_ << 79 } << 80 } 268 } 81 269 82 *rimage = image; 270 *rimage = image; 83 return 0; 271 return 0; 84 out_free_control_pages: !! 272 >> 273 out_free: 85 kimage_free_page_list(&image->control_ 274 kimage_free_page_list(&image->control_pages); 86 out_free_image: << 87 kfree(image); 275 kfree(image); 88 return ret; !! 276 out: >> 277 return result; 89 } 278 } 90 279 91 static int do_kexec_load(unsigned long entry, !! 280 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, 92 struct kexec_segment *segments !! 281 unsigned long nr_segments, >> 282 struct kexec_segment __user *segments) 93 { 283 { 94 struct kimage **dest_image, *image; !! 284 int result; >> 285 struct kimage *image; 95 unsigned long i; 286 unsigned long i; 96 int ret; !! 287 >> 288 image = NULL; >> 289 /* Verify we have a valid entry point */ >> 290 if ((entry < crashk_res.start) || (entry > crashk_res.end)) { >> 291 result = -EADDRNOTAVAIL; >> 292 goto out; >> 293 } >> 294 >> 295 /* Allocate and initialize a controlling structure */ >> 296 result = do_kimage_alloc(&image, entry, nr_segments, segments); >> 297 if (result) >> 298 goto out; >> 299 >> 300 /* Enable the special crash kernel control page >> 301 * allocation policy. >> 302 */ >> 303 image->control_page = crashk_res.start; >> 304 image->type = KEXEC_TYPE_CRASH; 97 305 98 /* 306 /* 99 * Because we write directly to the re !! 307 * Verify we have good destination addresses. Normally 100 * crash kernels we need a serializati !! 308 * the caller is responsible for making certain we don't 101 * kernels from attempting to load sim !! 309 * attempt to load the new image into invalid or reserved >> 310 * areas of RAM. But crash kernels are preloaded into a >> 311 * reserved area of ram. We must ensure the addresses >> 312 * are in the reserved area otherwise preloading the >> 313 * kernel could corrupt things. 102 */ 314 */ 103 if (!kexec_trylock()) !! 315 result = -EADDRNOTAVAIL; 104 return -EBUSY; !! 316 for (i = 0; i < nr_segments; i++) { >> 317 unsigned long mstart, mend; 105 318 106 #ifdef CONFIG_CRASH_DUMP !! 319 mstart = image->segment[i].mem; 107 if (flags & KEXEC_ON_CRASH) { !! 320 mend = mstart + image->segment[i].memsz - 1; 108 dest_image = &kexec_crash_imag !! 321 /* Ensure we are within the crash kernel limits */ 109 if (kexec_crash_image) !! 322 if ((mstart < crashk_res.start) || (mend > crashk_res.end)) 110 arch_kexec_unprotect_c !! 323 goto out_free; 111 } else !! 324 } 112 #endif !! 325 113 dest_image = &kexec_image; !! 326 /* >> 327 * Find a location for the control code buffer, and add >> 328 * the vector of segments so that it's pages will also be >> 329 * counted as destination pages. 
>> 330 */ >> 331 result = -ENOMEM; >> 332 image->control_code_page = kimage_alloc_control_pages(image, >> 333 get_order(KEXEC_CONTROL_PAGE_SIZE)); >> 334 if (!image->control_code_page) { >> 335 printk(KERN_ERR "Could not allocate control_code_buffer\n"); >> 336 goto out_free; >> 337 } >> 338 >> 339 *rimage = image; >> 340 return 0; >> 341 >> 342 out_free: >> 343 kfree(image); >> 344 out: >> 345 return result; >> 346 } >> 347 >> 348 static int kimage_is_destination_range(struct kimage *image, >> 349 unsigned long start, >> 350 unsigned long end) >> 351 { >> 352 unsigned long i; 114 353 115 if (nr_segments == 0) { !! 354 for (i = 0; i < image->nr_segments; i++) { 116 /* Uninstall image */ !! 355 unsigned long mstart, mend; 117 kimage_free(xchg(dest_image, N !! 356 118 ret = 0; !! 357 mstart = image->segment[i].mem; 119 goto out_unlock; !! 358 mend = mstart + image->segment[i].memsz; >> 359 if ((end > mstart) && (start < mend)) >> 360 return 1; 120 } 361 } 121 if (flags & KEXEC_ON_CRASH) { !! 362 122 /* !! 363 return 0; 123 * Loading another kernel to s !! 364 } 124 * crashes. Free any current !! 365 125 * we corrupt it. !! 366 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) >> 367 { >> 368 struct page *pages; >> 369 >> 370 pages = alloc_pages(gfp_mask, order); >> 371 if (pages) { >> 372 unsigned int count, i; >> 373 pages->mapping = NULL; >> 374 set_page_private(pages, order); >> 375 count = 1 << order; >> 376 for (i = 0; i < count; i++) >> 377 SetPageReserved(pages + i); >> 378 } >> 379 >> 380 return pages; >> 381 } >> 382 >> 383 static void kimage_free_pages(struct page *page) >> 384 { >> 385 unsigned int order, count, i; >> 386 >> 387 order = page_private(page); >> 388 count = 1 << order; >> 389 for (i = 0; i < count; i++) >> 390 ClearPageReserved(page + i); >> 391 __free_pages(page, order); >> 392 } >> 393 >> 394 static void kimage_free_page_list(struct list_head *list) >> 395 { >> 396 struct list_head *pos, *next; >> 397 >> 398 list_for_each_safe(pos, next, list) { >> 399 struct page *page; >> 400 >> 401 page = list_entry(pos, struct page, lru); >> 402 list_del(&page->lru); >> 403 kimage_free_pages(page); >> 404 } >> 405 } >> 406 >> 407 static struct page *kimage_alloc_normal_control_pages(struct kimage *image, >> 408 unsigned int order) >> 409 { >> 410 /* Control pages are special, they are the intermediaries >> 411 * that are needed while we copy the rest of the pages >> 412 * to their final resting place. As such they must >> 413 * not conflict with either the destination addresses >> 414 * or memory the kernel is already using. >> 415 * >> 416 * The only case where we really need more than one of >> 417 * these are for architectures where we cannot disable >> 418 * the MMU and must instead generate an identity mapped >> 419 * page table for all of the memory. >> 420 * >> 421 * At worst this runs in O(N) of the image size. >> 422 */ >> 423 struct list_head extra_pages; >> 424 struct page *pages; >> 425 unsigned int count; >> 426 >> 427 count = 1 << order; >> 428 INIT_LIST_HEAD(&extra_pages); >> 429 >> 430 /* Loop while I can allocate a page and the page allocated >> 431 * is a destination page. 
>> 432 */ >> 433 do { >> 434 unsigned long pfn, epfn, addr, eaddr; >> 435 >> 436 pages = kimage_alloc_pages(GFP_KERNEL, order); >> 437 if (!pages) >> 438 break; >> 439 pfn = page_to_pfn(pages); >> 440 epfn = pfn + count; >> 441 addr = pfn << PAGE_SHIFT; >> 442 eaddr = epfn << PAGE_SHIFT; >> 443 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || >> 444 kimage_is_destination_range(image, addr, eaddr)) { >> 445 list_add(&pages->lru, &extra_pages); >> 446 pages = NULL; >> 447 } >> 448 } while (!pages); >> 449 >> 450 if (pages) { >> 451 /* Remember the allocated page... */ >> 452 list_add(&pages->lru, &image->control_pages); >> 453 >> 454 /* Because the page is already in it's destination >> 455 * location we will never allocate another page at >> 456 * that address. Therefore kimage_alloc_pages >> 457 * will not return it (again) and we don't need >> 458 * to give it an entry in image->segment[]. 126 */ 459 */ 127 kimage_free(xchg(&kexec_crash_ << 128 } 460 } >> 461 /* Deal with the destination pages I have inadvertently allocated. >> 462 * >> 463 * Ideally I would convert multi-page allocations into single >> 464 * page allocations, and add everything to image->dest_pages. >> 465 * >> 466 * For now it is simpler to just free the pages. >> 467 */ >> 468 kimage_free_page_list(&extra_pages); >> 469 >> 470 return pages; >> 471 } >> 472 >> 473 static struct page *kimage_alloc_crash_control_pages(struct kimage *image, >> 474 unsigned int order) >> 475 { >> 476 /* Control pages are special, they are the intermediaries >> 477 * that are needed while we copy the rest of the pages >> 478 * to their final resting place. As such they must >> 479 * not conflict with either the destination addresses >> 480 * or memory the kernel is already using. >> 481 * >> 482 * Control pages are also the only pags we must allocate >> 483 * when loading a crash kernel. All of the other pages >> 484 * are specified by the segments and we just memcpy >> 485 * into them directly. >> 486 * >> 487 * The only case where we really need more than one of >> 488 * these are for architectures where we cannot disable >> 489 * the MMU and must instead generate an identity mapped >> 490 * page table for all of the memory. >> 491 * >> 492 * Given the low demand this implements a very simple >> 493 * allocator that finds the first hole of the appropriate >> 494 * size in the reserved memory region, and allocates all >> 495 * of the memory up to and including the hole. >> 496 */ >> 497 unsigned long hole_start, hole_end, size; >> 498 struct page *pages; >> 499 >> 500 pages = NULL; >> 501 size = (1 << order) << PAGE_SHIFT; >> 502 hole_start = (image->control_page + (size - 1)) & ~(size - 1); >> 503 hole_end = hole_start + size - 1; >> 504 while (hole_end <= crashk_res.end) { >> 505 unsigned long i; >> 506 >> 507 if (hole_end > KEXEC_CRASH_CONTROL_MEMORY_LIMIT) >> 508 break; >> 509 /* See if I overlap any of the segments */ >> 510 for (i = 0; i < image->nr_segments; i++) { >> 511 unsigned long mstart, mend; >> 512 >> 513 mstart = image->segment[i].mem; >> 514 mend = mstart + image->segment[i].memsz - 1; >> 515 if ((hole_end >= mstart) && (hole_start <= mend)) { >> 516 /* Advance the hole to the end of the segment */ >> 517 hole_start = (mend + (size - 1)) & ~(size - 1); >> 518 hole_end = hole_start + size - 1; >> 519 break; >> 520 } >> 521 } >> 522 /* If I don't overlap any segments I have found my hole! 
*/ >> 523 if (i == image->nr_segments) { >> 524 pages = pfn_to_page(hole_start >> PAGE_SHIFT); >> 525 break; >> 526 } >> 527 } >> 528 if (pages) >> 529 image->control_page = hole_end; 129 530 130 ret = kimage_alloc_init(&image, entry, !! 531 return pages; 131 if (ret) !! 532 } 132 goto out_unlock; << 133 533 134 if (flags & KEXEC_PRESERVE_CONTEXT) << 135 image->preserve_context = 1; << 136 534 137 #ifdef CONFIG_CRASH_HOTPLUG !! 535 struct page *kimage_alloc_control_pages(struct kimage *image, 138 if ((flags & KEXEC_ON_CRASH) && arch_c !! 536 unsigned int order) 139 image->hotplug_support = 1; !! 537 { 140 #endif !! 538 struct page *pages = NULL; 141 539 142 ret = machine_kexec_prepare(image); !! 540 switch (image->type) { 143 if (ret) !! 541 case KEXEC_TYPE_DEFAULT: 144 goto out; !! 542 pages = kimage_alloc_normal_control_pages(image, order); >> 543 break; >> 544 case KEXEC_TYPE_CRASH: >> 545 pages = kimage_alloc_crash_control_pages(image, order); >> 546 break; >> 547 } >> 548 >> 549 return pages; >> 550 } >> 551 >> 552 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) >> 553 { >> 554 if (*image->entry != 0) >> 555 image->entry++; >> 556 >> 557 if (image->entry == image->last_entry) { >> 558 kimage_entry_t *ind_page; >> 559 struct page *page; >> 560 >> 561 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); >> 562 if (!page) >> 563 return -ENOMEM; >> 564 >> 565 ind_page = page_address(page); >> 566 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; >> 567 image->entry = ind_page; >> 568 image->last_entry = ind_page + >> 569 ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); >> 570 } >> 571 *image->entry = entry; >> 572 image->entry++; >> 573 *image->entry = 0; >> 574 >> 575 return 0; >> 576 } >> 577 >> 578 static int kimage_set_destination(struct kimage *image, >> 579 unsigned long destination) >> 580 { >> 581 int result; >> 582 >> 583 destination &= PAGE_MASK; >> 584 result = kimage_add_entry(image, destination | IND_DESTINATION); >> 585 if (result == 0) >> 586 image->destination = destination; >> 587 >> 588 return result; >> 589 } >> 590 >> 591 >> 592 static int kimage_add_page(struct kimage *image, unsigned long page) >> 593 { >> 594 int result; >> 595 >> 596 page &= PAGE_MASK; >> 597 result = kimage_add_entry(image, page | IND_SOURCE); >> 598 if (result == 0) >> 599 image->destination += PAGE_SIZE; >> 600 >> 601 return result; >> 602 } >> 603 >> 604 >> 605 static void kimage_free_extra_pages(struct kimage *image) >> 606 { >> 607 /* Walk through and free any extra destination pages I may have */ >> 608 kimage_free_page_list(&image->dest_pages); >> 609 >> 610 /* Walk through and free any unusable pages I have cached */ >> 611 kimage_free_page_list(&image->unuseable_pages); >> 612 >> 613 } >> 614 static void kimage_terminate(struct kimage *image) >> 615 { >> 616 if (*image->entry != 0) >> 617 image->entry++; >> 618 >> 619 *image->entry = IND_DONE; >> 620 } >> 621 >> 622 #define for_each_kimage_entry(image, ptr, entry) \ >> 623 for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \ >> 624 ptr = (entry & IND_INDIRECTION)? 
\ >> 625 phys_to_virt((entry & PAGE_MASK)): ptr +1) >> 626 >> 627 static void kimage_free_entry(kimage_entry_t entry) >> 628 { >> 629 struct page *page; >> 630 >> 631 page = pfn_to_page(entry >> PAGE_SHIFT); >> 632 kimage_free_pages(page); >> 633 } >> 634 >> 635 static void kimage_free(struct kimage *image) >> 636 { >> 637 kimage_entry_t *ptr, entry; >> 638 kimage_entry_t ind = 0; >> 639 >> 640 if (!image) >> 641 return; >> 642 >> 643 kimage_free_extra_pages(image); >> 644 for_each_kimage_entry(image, ptr, entry) { >> 645 if (entry & IND_INDIRECTION) { >> 646 /* Free the previous indirection page */ >> 647 if (ind & IND_INDIRECTION) >> 648 kimage_free_entry(ind); >> 649 /* Save this indirection page until we are >> 650 * done with it. >> 651 */ >> 652 ind = entry; >> 653 } >> 654 else if (entry & IND_SOURCE) >> 655 kimage_free_entry(entry); >> 656 } >> 657 /* Free the final indirection page */ >> 658 if (ind & IND_INDIRECTION) >> 659 kimage_free_entry(ind); >> 660 >> 661 /* Handle any machine specific cleanup */ >> 662 machine_kexec_cleanup(image); >> 663 >> 664 /* Free the kexec control pages... */ >> 665 kimage_free_page_list(&image->control_pages); >> 666 kfree(image); >> 667 } >> 668 >> 669 static kimage_entry_t *kimage_dst_used(struct kimage *image, >> 670 unsigned long page) >> 671 { >> 672 kimage_entry_t *ptr, entry; >> 673 unsigned long destination = 0; >> 674 >> 675 for_each_kimage_entry(image, ptr, entry) { >> 676 if (entry & IND_DESTINATION) >> 677 destination = entry & PAGE_MASK; >> 678 else if (entry & IND_SOURCE) { >> 679 if (page == destination) >> 680 return ptr; >> 681 destination += PAGE_SIZE; >> 682 } >> 683 } >> 684 >> 685 return NULL; >> 686 } 145 687 >> 688 static struct page *kimage_alloc_page(struct kimage *image, >> 689 gfp_t gfp_mask, >> 690 unsigned long destination) >> 691 { 146 /* 692 /* 147 * Some architecture(like S390) may to !! 693 * Here we implement safeguards to ensure that a source page 148 * machine_kexec_prepare(), we must co !! 694 * is not copied to its destination page before the data on >> 695 * the destination page is no longer useful. >> 696 * >> 697 * To do this we maintain the invariant that a source page is >> 698 * either its own destination page, or it is not a >> 699 * destination page at all. >> 700 * >> 701 * That is slightly stronger than required, but the proof >> 702 * that no problems will not occur is trivial, and the >> 703 * implementation is simply to verify. >> 704 * >> 705 * When allocating all pages normally this algorithm will run >> 706 * in O(N) time, but in the worst case it will run in O(N^2) >> 707 * time. If the runtime is a problem the data structures can >> 708 * be fixed. 149 */ 709 */ 150 ret = kimage_crash_copy_vmcoreinfo(ima !! 710 struct page *page; 151 if (ret) !! 711 unsigned long addr; 152 goto out; << 153 712 154 for (i = 0; i < nr_segments; i++) { !! 713 /* 155 ret = kimage_load_segment(imag !! 714 * Walk through the list of destination pages, and see if I 156 if (ret) !! 715 * have a match. 157 goto out; !! 
716 */ >> 717 list_for_each_entry(page, &image->dest_pages, lru) { >> 718 addr = page_to_pfn(page) << PAGE_SHIFT; >> 719 if (addr == destination) { >> 720 list_del(&page->lru); >> 721 return page; >> 722 } >> 723 } >> 724 page = NULL; >> 725 while (1) { >> 726 kimage_entry_t *old; >> 727 >> 728 /* Allocate a page, if we run out of memory give up */ >> 729 page = kimage_alloc_pages(gfp_mask, 0); >> 730 if (!page) >> 731 return NULL; >> 732 /* If the page cannot be used file it away */ >> 733 if (page_to_pfn(page) > >> 734 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { >> 735 list_add(&page->lru, &image->unuseable_pages); >> 736 continue; >> 737 } >> 738 addr = page_to_pfn(page) << PAGE_SHIFT; >> 739 >> 740 /* If it is the destination page we want use it */ >> 741 if (addr == destination) >> 742 break; >> 743 >> 744 /* If the page is not a destination page use it */ >> 745 if (!kimage_is_destination_range(image, addr, >> 746 addr + PAGE_SIZE)) >> 747 break; >> 748 >> 749 /* >> 750 * I know that the page is someones destination page. >> 751 * See if there is already a source page for this >> 752 * destination page. And if so swap the source pages. >> 753 */ >> 754 old = kimage_dst_used(image, addr); >> 755 if (old) { >> 756 /* If so move it */ >> 757 unsigned long old_addr; >> 758 struct page *old_page; >> 759 >> 760 old_addr = *old & PAGE_MASK; >> 761 old_page = pfn_to_page(old_addr >> PAGE_SHIFT); >> 762 copy_highpage(page, old_page); >> 763 *old = addr | (*old & ~PAGE_MASK); >> 764 >> 765 /* The old page I have found cannot be a >> 766 * destination page, so return it if it's >> 767 * gfp_flags honor the ones passed in. >> 768 */ >> 769 if (!(gfp_mask & __GFP_HIGHMEM) && >> 770 PageHighMem(old_page)) { >> 771 kimage_free_pages(old_page); >> 772 continue; >> 773 } >> 774 addr = old_addr; >> 775 page = old_page; >> 776 break; >> 777 } >> 778 else { >> 779 /* Place the page on the destination list I >> 780 * will use it later. >> 781 */ >> 782 list_add(&page->lru, &image->dest_pages); >> 783 } 158 } 784 } 159 785 160 kimage_terminate(image); !! 786 return page; >> 787 } >> 788 >> 789 static int kimage_load_normal_segment(struct kimage *image, >> 790 struct kexec_segment *segment) >> 791 { >> 792 unsigned long maddr; >> 793 size_t ubytes, mbytes; >> 794 int result; >> 795 unsigned char __user *buf; >> 796 >> 797 result = 0; >> 798 buf = segment->buf; >> 799 ubytes = segment->bufsz; >> 800 mbytes = segment->memsz; >> 801 maddr = segment->mem; 161 802 162 ret = machine_kexec_post_load(image); !! 803 result = kimage_set_destination(image, maddr); 163 if (ret) !! 804 if (result < 0) 164 goto out; 805 goto out; 165 806 166 /* Install the new kernel and uninstal !! 807 while (mbytes) { 167 image = xchg(dest_image, image); !! 
808 struct page *page; >> 809 char *ptr; >> 810 size_t uchunk, mchunk; >> 811 >> 812 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); >> 813 if (!page) { >> 814 result = -ENOMEM; >> 815 goto out; >> 816 } >> 817 result = kimage_add_page(image, page_to_pfn(page) >> 818 << PAGE_SHIFT); >> 819 if (result < 0) >> 820 goto out; 168 821 >> 822 ptr = kmap(page); >> 823 /* Start with a clear page */ >> 824 clear_page(ptr); >> 825 ptr += maddr & ~PAGE_MASK; >> 826 mchunk = min_t(size_t, mbytes, >> 827 PAGE_SIZE - (maddr & ~PAGE_MASK)); >> 828 uchunk = min(ubytes, mchunk); >> 829 >> 830 result = copy_from_user(ptr, buf, uchunk); >> 831 kunmap(page); >> 832 if (result) { >> 833 result = -EFAULT; >> 834 goto out; >> 835 } >> 836 ubytes -= uchunk; >> 837 maddr += mchunk; >> 838 buf += mchunk; >> 839 mbytes -= mchunk; >> 840 } 169 out: 841 out: 170 #ifdef CONFIG_CRASH_DUMP !! 842 return result; 171 if ((flags & KEXEC_ON_CRASH) && kexec_ !! 843 } 172 arch_kexec_protect_crashkres() << 173 #endif << 174 844 175 kimage_free(image); !! 845 static int kimage_load_crash_segment(struct kimage *image, 176 out_unlock: !! 846 struct kexec_segment *segment) 177 kexec_unlock(); !! 847 { 178 return ret; !! 848 /* For crash dumps kernels we simply copy the data from >> 849 * user space to it's destination. >> 850 * We do things a page at a time for the sake of kmap. >> 851 */ >> 852 unsigned long maddr; >> 853 size_t ubytes, mbytes; >> 854 int result; >> 855 unsigned char __user *buf; >> 856 >> 857 result = 0; >> 858 buf = segment->buf; >> 859 ubytes = segment->bufsz; >> 860 mbytes = segment->memsz; >> 861 maddr = segment->mem; >> 862 while (mbytes) { >> 863 struct page *page; >> 864 char *ptr; >> 865 size_t uchunk, mchunk; >> 866 >> 867 page = pfn_to_page(maddr >> PAGE_SHIFT); >> 868 if (!page) { >> 869 result = -ENOMEM; >> 870 goto out; >> 871 } >> 872 ptr = kmap(page); >> 873 ptr += maddr & ~PAGE_MASK; >> 874 mchunk = min_t(size_t, mbytes, >> 875 PAGE_SIZE - (maddr & ~PAGE_MASK)); >> 876 uchunk = min(ubytes, mchunk); >> 877 if (mchunk > uchunk) { >> 878 /* Zero the trailing part of the page */ >> 879 memset(ptr + uchunk, 0, mchunk - uchunk); >> 880 } >> 881 result = copy_from_user(ptr, buf, uchunk); >> 882 kexec_flush_icache_page(page); >> 883 kunmap(page); >> 884 if (result) { >> 885 result = -EFAULT; >> 886 goto out; >> 887 } >> 888 ubytes -= uchunk; >> 889 maddr += mchunk; >> 890 buf += mchunk; >> 891 mbytes -= mchunk; >> 892 } >> 893 out: >> 894 return result; >> 895 } >> 896 >> 897 static int kimage_load_segment(struct kimage *image, >> 898 struct kexec_segment *segment) >> 899 { >> 900 int result = -ENOMEM; >> 901 >> 902 switch (image->type) { >> 903 case KEXEC_TYPE_DEFAULT: >> 904 result = kimage_load_normal_segment(image, segment); >> 905 break; >> 906 case KEXEC_TYPE_CRASH: >> 907 result = kimage_load_crash_segment(image, segment); >> 908 break; >> 909 } >> 910 >> 911 return result; 179 } 912 } 180 913 181 /* 914 /* 182 * Exec Kernel system call: for obvious reason 915 * Exec Kernel system call: for obvious reasons only root may call it. 183 * 916 * 184 * This call breaks up into three pieces. 917 * This call breaks up into three pieces. 185 * - A generic part which loads the new kernel 918 * - A generic part which loads the new kernel from the current 186 * address space, and very carefully places 919 * address space, and very carefully places the data in the 187 * allocated pages. 920 * allocated pages. 
188 * 921 * 189 * - A generic part that interacts with the ke 922 * - A generic part that interacts with the kernel and tells all of 190 * the devices to shut down. Preventing on- 923 * the devices to shut down. Preventing on-going dmas, and placing 191 * the devices in a consistent state so a la 924 * the devices in a consistent state so a later kernel can 192 * reinitialize them. 925 * reinitialize them. 193 * 926 * 194 * - A machine specific part that includes the 927 * - A machine specific part that includes the syscall number 195 * and then copies the image to it's final d !! 928 * and the copies the image to it's final destination. And 196 * jumps into the image at entry. 929 * jumps into the image at entry. 197 * 930 * 198 * kexec does not sync, or unmount filesystems 931 * kexec does not sync, or unmount filesystems so if you need 199 * that to happen you need to do that yourself 932 * that to happen you need to do that yourself. 200 */ 933 */ >> 934 struct kimage *kexec_image; >> 935 struct kimage *kexec_crash_image; >> 936 >> 937 static DEFINE_MUTEX(kexec_mutex); 201 938 202 static inline int kexec_load_check(unsigned lo !! 939 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, 203 unsigned lo !! 940 struct kexec_segment __user *, segments, unsigned long, flags) 204 { 941 { 205 int image_type = (flags & KEXEC_ON_CRA !! 942 struct kimage **dest_image, *image; 206 KEXEC_TYPE_CRASH : KE << 207 int result; 943 int result; 208 944 209 /* We only trust the superuser with re 945 /* We only trust the superuser with rebooting the system. */ 210 if (!kexec_load_permitted(image_type)) !! 946 if (!capable(CAP_SYS_BOOT)) 211 return -EPERM; 947 return -EPERM; 212 if (!ccs_capable(CCS_SYS_KEXEC_LOAD)) 948 if (!ccs_capable(CCS_SYS_KEXEC_LOAD)) 213 return -EPERM; 949 return -EPERM; 214 950 215 /* Permit LSMs and IMA to fail the kex << 216 result = security_kernel_load_data(LOA << 217 if (result < 0) << 218 return result; << 219 << 220 /* << 221 * kexec can be used to circumvent mod << 222 * prevent loading in that case << 223 */ << 224 result = security_locked_down(LOCKDOWN << 225 if (result) << 226 return result; << 227 << 228 /* 951 /* 229 * Verify we have a legal set of flags 952 * Verify we have a legal set of flags 230 * This leaves us room for future exte 953 * This leaves us room for future extensions. 231 */ 954 */ 232 if ((flags & KEXEC_FLAGS) != (flags & 955 if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK)) 233 return -EINVAL; 956 return -EINVAL; 234 957 >> 958 /* Verify we are on the appropriate architecture */ >> 959 if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && >> 960 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) >> 961 return -EINVAL; >> 962 235 /* Put an artificial cap on the number 963 /* Put an artificial cap on the number 236 * of segments passed to kexec_load. 964 * of segments passed to kexec_load. 237 */ 965 */ 238 if (nr_segments > KEXEC_SEGMENT_MAX) 966 if (nr_segments > KEXEC_SEGMENT_MAX) 239 return -EINVAL; 967 return -EINVAL; 240 968 241 return 0; !! 969 image = NULL; 242 } !! 970 result = 0; 243 971 244 SYSCALL_DEFINE4(kexec_load, unsigned long, ent !! 972 /* Because we write directly to the reserved memory 245 struct kexec_segment __user *, !! 973 * region when loading crash kernels we need a mutex here to 246 { !! 974 * prevent multiple crash kernels from attempting to load 247 struct kexec_segment *ksegments; !! 975 * simultaneously, and to prevent a crash kernel from loading 248 unsigned long result; !! 
976 * over the top of a in use crash kernel. >> 977 * >> 978 * KISS: always take the mutex. >> 979 */ >> 980 if (!mutex_trylock(&kexec_mutex)) >> 981 return -EBUSY; 249 982 250 result = kexec_load_check(nr_segments, !! 983 dest_image = &kexec_image; 251 if (result) !! 984 if (flags & KEXEC_ON_CRASH) 252 return result; !! 985 dest_image = &kexec_crash_image; >> 986 if (nr_segments > 0) { >> 987 unsigned long i; 253 988 254 /* Verify we are on the appropriate ar !! 989 /* Loading another kernel to reboot into */ 255 if (((flags & KEXEC_ARCH_MASK) != KEXE !! 990 if ((flags & KEXEC_ON_CRASH) == 0) 256 ((flags & KEXEC_ARCH_MASK) != !! 991 result = kimage_normal_alloc(&image, entry, 257 return -EINVAL; !! 992 nr_segments, segments); >> 993 /* Loading another kernel to switch to if this one crashes */ >> 994 else if (flags & KEXEC_ON_CRASH) { >> 995 /* Free any current crash dump kernel before >> 996 * we corrupt it. >> 997 */ >> 998 kimage_free(xchg(&kexec_crash_image, NULL)); >> 999 result = kimage_crash_alloc(&image, entry, >> 1000 nr_segments, segments); >> 1001 crash_map_reserved_pages(); >> 1002 } >> 1003 if (result) >> 1004 goto out; >> 1005 >> 1006 if (flags & KEXEC_PRESERVE_CONTEXT) >> 1007 image->preserve_context = 1; >> 1008 result = machine_kexec_prepare(image); >> 1009 if (result) >> 1010 goto out; 258 1011 259 ksegments = memdup_array_user(segments !! 1012 for (i = 0; i < nr_segments; i++) { 260 if (IS_ERR(ksegments)) !! 1013 result = kimage_load_segment(image, &image->segment[i]); 261 return PTR_ERR(ksegments); !! 1014 if (result) >> 1015 goto out; >> 1016 } >> 1017 kimage_terminate(image); >> 1018 if (flags & KEXEC_ON_CRASH) >> 1019 crash_unmap_reserved_pages(); >> 1020 } >> 1021 /* Install the new kernel, and Uninstall the old */ >> 1022 image = xchg(dest_image, image); 262 1023 263 result = do_kexec_load(entry, nr_segme !! 1024 out: 264 kfree(ksegments); !! 1025 mutex_unlock(&kexec_mutex); >> 1026 kimage_free(image); 265 1027 266 return result; 1028 return result; 267 } 1029 } 268 1030 >> 1031 /* >> 1032 * Add and remove page tables for crashkernel memory >> 1033 * >> 1034 * Provide an empty default implementation here -- architecture >> 1035 * code may override this >> 1036 */ >> 1037 void __weak crash_map_reserved_pages(void) >> 1038 {} >> 1039 >> 1040 void __weak crash_unmap_reserved_pages(void) >> 1041 {} >> 1042 269 #ifdef CONFIG_COMPAT 1043 #ifdef CONFIG_COMPAT 270 COMPAT_SYSCALL_DEFINE4(kexec_load, compat_ulon !! 1044 asmlinkage long compat_sys_kexec_load(unsigned long entry, 271 compat_ulong_t, nr_segm !! 1045 unsigned long nr_segments, 272 struct compat_kexec_seg !! 1046 struct compat_kexec_segment __user *segments, 273 compat_ulong_t, flags) !! 1047 unsigned long flags) 274 { 1048 { 275 struct compat_kexec_segment in; 1049 struct compat_kexec_segment in; 276 struct kexec_segment *ksegments; !! 1050 struct kexec_segment out, __user *ksegments; 277 unsigned long i, result; 1051 unsigned long i, result; 278 1052 279 result = kexec_load_check(nr_segments, << 280 if (result) << 281 return result; << 282 << 283 /* Don't allow clients that don't unde 1053 /* Don't allow clients that don't understand the native 284 * architecture to do anything. 1054 * architecture to do anything. 285 */ 1055 */ 286 if ((flags & KEXEC_ARCH_MASK) == KEXEC 1056 if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) 287 return -EINVAL; 1057 return -EINVAL; 288 1058 289 ksegments = kmalloc_array(nr_segments, !! 1059 if (nr_segments > KEXEC_SEGMENT_MAX) 290 GFP_KERNEL); !! 
1060 return -EINVAL; 291 if (!ksegments) << 292 return -ENOMEM; << 293 1061 294 for (i = 0; i < nr_segments; i++) { !! 1062 ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); >> 1063 for (i=0; i < nr_segments; i++) { 295 result = copy_from_user(&in, & 1064 result = copy_from_user(&in, &segments[i], sizeof(in)); 296 if (result) 1065 if (result) 297 goto fail; !! 1066 return -EFAULT; >> 1067 >> 1068 out.buf = compat_ptr(in.buf); >> 1069 out.bufsz = in.bufsz; >> 1070 out.mem = in.mem; >> 1071 out.memsz = in.memsz; 298 1072 299 ksegments[i].buf = compat_pt !! 1073 result = copy_to_user(&ksegments[i], &out, sizeof(out)); 300 ksegments[i].bufsz = in.bufsz; !! 1074 if (result) 301 ksegments[i].mem = in.mem; !! 1075 return -EFAULT; 302 ksegments[i].memsz = in.memsz; << 303 } 1076 } 304 1077 305 result = do_kexec_load(entry, nr_segme !! 1078 return sys_kexec_load(entry, nr_segments, ksegments, flags); >> 1079 } >> 1080 #endif 306 1081 307 fail: !! 1082 void crash_kexec(struct pt_regs *regs) 308 kfree(ksegments); !! 1083 { 309 return result; !! 1084 /* Take the kexec_mutex here to prevent sys_kexec_load >> 1085 * running on one cpu from replacing the crash kernel >> 1086 * we are using after a panic on a different cpu. >> 1087 * >> 1088 * If the crash kernel was not located in a fixed area >> 1089 * of memory the xchg(&kexec_crash_image) would be >> 1090 * sufficient. But since I reuse the memory... >> 1091 */ >> 1092 if (mutex_trylock(&kexec_mutex)) { >> 1093 if (kexec_crash_image) { >> 1094 struct pt_regs fixed_regs; >> 1095 >> 1096 crash_setup_regs(&fixed_regs, regs); >> 1097 crash_save_vmcoreinfo(); >> 1098 machine_crash_shutdown(&fixed_regs); >> 1099 machine_kexec(kexec_crash_image); >> 1100 } >> 1101 mutex_unlock(&kexec_mutex); >> 1102 } >> 1103 } >> 1104 >> 1105 size_t crash_get_memory_size(void) >> 1106 { >> 1107 size_t size = 0; >> 1108 mutex_lock(&kexec_mutex); >> 1109 if (crashk_res.end != crashk_res.start) >> 1110 size = resource_size(&crashk_res); >> 1111 mutex_unlock(&kexec_mutex); >> 1112 return size; >> 1113 } >> 1114 >> 1115 void __weak crash_free_reserved_phys_range(unsigned long begin, >> 1116 unsigned long end) >> 1117 { >> 1118 unsigned long addr; >> 1119 >> 1120 for (addr = begin; addr < end; addr += PAGE_SIZE) >> 1121 free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); >> 1122 } >> 1123 >> 1124 int crash_shrink_memory(unsigned long new_size) >> 1125 { >> 1126 int ret = 0; >> 1127 unsigned long start, end; >> 1128 unsigned long old_size; >> 1129 struct resource *ram_res; >> 1130 >> 1131 mutex_lock(&kexec_mutex); >> 1132 >> 1133 if (kexec_crash_image) { >> 1134 ret = -ENOENT; >> 1135 goto unlock; >> 1136 } >> 1137 start = crashk_res.start; >> 1138 end = crashk_res.end; >> 1139 old_size = (end == 0) ? 0 : end - start + 1; >> 1140 if (new_size >= old_size) { >> 1141 ret = (new_size == old_size) ? 
0 : -EINVAL; >> 1142 goto unlock; >> 1143 } >> 1144 >> 1145 ram_res = kzalloc(sizeof(*ram_res), GFP_KERNEL); >> 1146 if (!ram_res) { >> 1147 ret = -ENOMEM; >> 1148 goto unlock; >> 1149 } >> 1150 >> 1151 start = roundup(start, KEXEC_CRASH_MEM_ALIGN); >> 1152 end = roundup(start + new_size, KEXEC_CRASH_MEM_ALIGN); >> 1153 >> 1154 crash_map_reserved_pages(); >> 1155 crash_free_reserved_phys_range(end, crashk_res.end); >> 1156 >> 1157 if ((start == end) && (crashk_res.parent != NULL)) >> 1158 release_resource(&crashk_res); >> 1159 >> 1160 ram_res->start = end; >> 1161 ram_res->end = crashk_res.end; >> 1162 ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; >> 1163 ram_res->name = "System RAM"; >> 1164 >> 1165 crashk_res.end = end - 1; >> 1166 >> 1167 insert_resource(&iomem_resource, ram_res); >> 1168 crash_unmap_reserved_pages(); >> 1169 >> 1170 unlock: >> 1171 mutex_unlock(&kexec_mutex); >> 1172 return ret; >> 1173 } >> 1174 >> 1175 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, >> 1176 size_t data_len) >> 1177 { >> 1178 struct elf_note note; >> 1179 >> 1180 note.n_namesz = strlen(name) + 1; >> 1181 note.n_descsz = data_len; >> 1182 note.n_type = type; >> 1183 memcpy(buf, ¬e, sizeof(note)); >> 1184 buf += (sizeof(note) + 3)/4; >> 1185 memcpy(buf, name, note.n_namesz); >> 1186 buf += (note.n_namesz + 3)/4; >> 1187 memcpy(buf, data, note.n_descsz); >> 1188 buf += (note.n_descsz + 3)/4; >> 1189 >> 1190 return buf; >> 1191 } >> 1192 >> 1193 static void final_note(u32 *buf) >> 1194 { >> 1195 struct elf_note note; >> 1196 >> 1197 note.n_namesz = 0; >> 1198 note.n_descsz = 0; >> 1199 note.n_type = 0; >> 1200 memcpy(buf, ¬e, sizeof(note)); >> 1201 } >> 1202 >> 1203 void crash_save_cpu(struct pt_regs *regs, int cpu) >> 1204 { >> 1205 struct elf_prstatus prstatus; >> 1206 u32 *buf; >> 1207 >> 1208 if ((cpu < 0) || (cpu >= nr_cpu_ids)) >> 1209 return; >> 1210 >> 1211 /* Using ELF notes here is opportunistic. >> 1212 * I need a well defined structure format >> 1213 * for the data I pass, and I need tags >> 1214 * on the data to indicate what information I have >> 1215 * squirrelled away. ELF notes happen to provide >> 1216 * all of that, so there is no need to invent something new. >> 1217 */ >> 1218 buf = (u32*)per_cpu_ptr(crash_notes, cpu); >> 1219 if (!buf) >> 1220 return; >> 1221 memset(&prstatus, 0, sizeof(prstatus)); >> 1222 prstatus.pr_pid = current->pid; >> 1223 elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); >> 1224 buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, >> 1225 &prstatus, sizeof(prstatus)); >> 1226 final_note(buf); >> 1227 } >> 1228 >> 1229 static int __init crash_notes_memory_init(void) >> 1230 { >> 1231 /* Allocate memory for saving cpu registers. */ >> 1232 crash_notes = alloc_percpu(note_buf_t); >> 1233 if (!crash_notes) { >> 1234 printk("Kexec: Memory allocation for saving cpu register" >> 1235 " states failed\n"); >> 1236 return -ENOMEM; >> 1237 } >> 1238 return 0; 310 } 1239 } >> 1240 module_init(crash_notes_memory_init) >> 1241 >> 1242 >> 1243 /* >> 1244 * parsing the "crashkernel" commandline >> 1245 * >> 1246 * this code is intended to be called from architecture specific code >> 1247 */ >> 1248 >> 1249 >> 1250 /* >> 1251 * This function parses command lines in the format >> 1252 * >> 1253 * crashkernel=ramsize-range:size[,...][@offset] >> 1254 * >> 1255 * The function returns 0 on success and -EINVAL on failure. 
>> 1256 */ >> 1257 static int __init parse_crashkernel_mem(char *cmdline, >> 1258 unsigned long long system_ram, >> 1259 unsigned long long *crash_size, >> 1260 unsigned long long *crash_base) >> 1261 { >> 1262 char *cur = cmdline, *tmp; >> 1263 >> 1264 /* for each entry of the comma-separated list */ >> 1265 do { >> 1266 unsigned long long start, end = ULLONG_MAX, size; >> 1267 >> 1268 /* get the start of the range */ >> 1269 start = memparse(cur, &tmp); >> 1270 if (cur == tmp) { >> 1271 pr_warning("crashkernel: Memory value expected\n"); >> 1272 return -EINVAL; >> 1273 } >> 1274 cur = tmp; >> 1275 if (*cur != '-') { >> 1276 pr_warning("crashkernel: '-' expected\n"); >> 1277 return -EINVAL; >> 1278 } >> 1279 cur++; >> 1280 >> 1281 /* if no ':' is here, than we read the end */ >> 1282 if (*cur != ':') { >> 1283 end = memparse(cur, &tmp); >> 1284 if (cur == tmp) { >> 1285 pr_warning("crashkernel: Memory " >> 1286 "value expected\n"); >> 1287 return -EINVAL; >> 1288 } >> 1289 cur = tmp; >> 1290 if (end <= start) { >> 1291 pr_warning("crashkernel: end <= start\n"); >> 1292 return -EINVAL; >> 1293 } >> 1294 } >> 1295 >> 1296 if (*cur != ':') { >> 1297 pr_warning("crashkernel: ':' expected\n"); >> 1298 return -EINVAL; >> 1299 } >> 1300 cur++; >> 1301 >> 1302 size = memparse(cur, &tmp); >> 1303 if (cur == tmp) { >> 1304 pr_warning("Memory value expected\n"); >> 1305 return -EINVAL; >> 1306 } >> 1307 cur = tmp; >> 1308 if (size >= system_ram) { >> 1309 pr_warning("crashkernel: invalid size\n"); >> 1310 return -EINVAL; >> 1311 } >> 1312 >> 1313 /* match ? */ >> 1314 if (system_ram >= start && system_ram < end) { >> 1315 *crash_size = size; >> 1316 break; >> 1317 } >> 1318 } while (*cur++ == ','); >> 1319 >> 1320 if (*crash_size > 0) { >> 1321 while (*cur && *cur != ' ' && *cur != '@') >> 1322 cur++; >> 1323 if (*cur == '@') { >> 1324 cur++; >> 1325 *crash_base = memparse(cur, &tmp); >> 1326 if (cur == tmp) { >> 1327 pr_warning("Memory value expected " >> 1328 "after '@'\n"); >> 1329 return -EINVAL; >> 1330 } >> 1331 } >> 1332 } >> 1333 >> 1334 return 0; >> 1335 } >> 1336 >> 1337 /* >> 1338 * That function parses "simple" (old) crashkernel command lines like >> 1339 * >> 1340 * crashkernel=size[@offset] >> 1341 * >> 1342 * It returns 0 on success and -EINVAL on failure. >> 1343 */ >> 1344 static int __init parse_crashkernel_simple(char *cmdline, >> 1345 unsigned long long *crash_size, >> 1346 unsigned long long *crash_base) >> 1347 { >> 1348 char *cur = cmdline; >> 1349 >> 1350 *crash_size = memparse(cmdline, &cur); >> 1351 if (cmdline == cur) { >> 1352 pr_warning("crashkernel: memory value expected\n"); >> 1353 return -EINVAL; >> 1354 } >> 1355 >> 1356 if (*cur == '@') >> 1357 *crash_base = memparse(cur+1, &cur); >> 1358 else if (*cur != ' ' && *cur != '\0') { >> 1359 pr_warning("crashkernel: unrecognized char\n"); >> 1360 return -EINVAL; >> 1361 } >> 1362 >> 1363 return 0; >> 1364 } >> 1365 >> 1366 #define SUFFIX_HIGH 0 >> 1367 #define SUFFIX_LOW 1 >> 1368 #define SUFFIX_NULL 2 >> 1369 static __initdata char *suffix_tbl[] = { >> 1370 [SUFFIX_HIGH] = ",high", >> 1371 [SUFFIX_LOW] = ",low", >> 1372 [SUFFIX_NULL] = NULL, >> 1373 }; >> 1374 >> 1375 /* >> 1376 * That function parses "suffix" crashkernel command lines like >> 1377 * >> 1378 * crashkernel=size,[high|low] >> 1379 * >> 1380 * It returns 0 on success and -EINVAL on failure. 
>> 1381 */ >> 1382 static int __init parse_crashkernel_suffix(char *cmdline, >> 1383 unsigned long long *crash_size, >> 1384 unsigned long long *crash_base, >> 1385 const char *suffix) >> 1386 { >> 1387 char *cur = cmdline; >> 1388 >> 1389 *crash_size = memparse(cmdline, &cur); >> 1390 if (cmdline == cur) { >> 1391 pr_warn("crashkernel: memory value expected\n"); >> 1392 return -EINVAL; >> 1393 } >> 1394 >> 1395 /* check with suffix */ >> 1396 if (strncmp(cur, suffix, strlen(suffix))) { >> 1397 pr_warn("crashkernel: unrecognized char\n"); >> 1398 return -EINVAL; >> 1399 } >> 1400 cur += strlen(suffix); >> 1401 if (*cur != ' ' && *cur != '\0') { >> 1402 pr_warn("crashkernel: unrecognized char\n"); >> 1403 return -EINVAL; >> 1404 } >> 1405 >> 1406 return 0; >> 1407 } >> 1408 >> 1409 static __init char *get_last_crashkernel(char *cmdline, >> 1410 const char *name, >> 1411 const char *suffix) >> 1412 { >> 1413 char *p = cmdline, *ck_cmdline = NULL; >> 1414 >> 1415 /* find crashkernel and use the last one if there are more */ >> 1416 p = strstr(p, name); >> 1417 while (p) { >> 1418 char *end_p = strchr(p, ' '); >> 1419 char *q; >> 1420 >> 1421 if (!end_p) >> 1422 end_p = p + strlen(p); >> 1423 >> 1424 if (!suffix) { >> 1425 int i; >> 1426 >> 1427 /* skip the one with any known suffix */ >> 1428 for (i = 0; suffix_tbl[i]; i++) { >> 1429 q = end_p - strlen(suffix_tbl[i]); >> 1430 if (!strncmp(q, suffix_tbl[i], >> 1431 strlen(suffix_tbl[i]))) >> 1432 goto next; >> 1433 } >> 1434 ck_cmdline = p; >> 1435 } else { >> 1436 q = end_p - strlen(suffix); >> 1437 if (!strncmp(q, suffix, strlen(suffix))) >> 1438 ck_cmdline = p; >> 1439 } >> 1440 next: >> 1441 p = strstr(p+1, name); >> 1442 } >> 1443 >> 1444 if (!ck_cmdline) >> 1445 return NULL; >> 1446 >> 1447 return ck_cmdline; >> 1448 } >> 1449 >> 1450 static int __init __parse_crashkernel(char *cmdline, >> 1451 unsigned long long system_ram, >> 1452 unsigned long long *crash_size, >> 1453 unsigned long long *crash_base, >> 1454 const char *name, >> 1455 const char *suffix) >> 1456 { >> 1457 char *first_colon, *first_space; >> 1458 char *ck_cmdline; >> 1459 >> 1460 BUG_ON(!crash_size || !crash_base); >> 1461 *crash_size = 0; >> 1462 *crash_base = 0; >> 1463 >> 1464 ck_cmdline = get_last_crashkernel(cmdline, name, suffix); >> 1465 >> 1466 if (!ck_cmdline) >> 1467 return -EINVAL; >> 1468 >> 1469 ck_cmdline += strlen(name); >> 1470 >> 1471 if (suffix) >> 1472 return parse_crashkernel_suffix(ck_cmdline, crash_size, >> 1473 crash_base, suffix); >> 1474 /* >> 1475 * if the commandline contains a ':', then that's the extended >> 1476 * syntax -- if not, it must be the classic syntax >> 1477 */ >> 1478 first_colon = strchr(ck_cmdline, ':'); >> 1479 first_space = strchr(ck_cmdline, ' '); >> 1480 if (first_colon && (!first_space || first_colon < first_space)) >> 1481 return parse_crashkernel_mem(ck_cmdline, system_ram, >> 1482 crash_size, crash_base); >> 1483 else >> 1484 return parse_crashkernel_simple(ck_cmdline, crash_size, >> 1485 crash_base); >> 1486 >> 1487 return 0; >> 1488 } >> 1489 >> 1490 /* >> 1491 * That function is the entry point for command line parsing and should be >> 1492 * called from the arch-specific code. 
>> 1493 */ >> 1494 int __init parse_crashkernel(char *cmdline, >> 1495 unsigned long long system_ram, >> 1496 unsigned long long *crash_size, >> 1497 unsigned long long *crash_base) >> 1498 { >> 1499 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, >> 1500 "crashkernel=", NULL); >> 1501 } >> 1502 >> 1503 int __init parse_crashkernel_high(char *cmdline, >> 1504 unsigned long long system_ram, >> 1505 unsigned long long *crash_size, >> 1506 unsigned long long *crash_base) >> 1507 { >> 1508 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, >> 1509 "crashkernel=", suffix_tbl[SUFFIX_HIGH]); >> 1510 } >> 1511 >> 1512 int __init parse_crashkernel_low(char *cmdline, >> 1513 unsigned long long system_ram, >> 1514 unsigned long long *crash_size, >> 1515 unsigned long long *crash_base) >> 1516 { >> 1517 return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, >> 1518 "crashkernel=", suffix_tbl[SUFFIX_LOW]); >> 1519 } >> 1520 >> 1521 static void update_vmcoreinfo_note(void) >> 1522 { >> 1523 u32 *buf = vmcoreinfo_note; >> 1524 >> 1525 if (!vmcoreinfo_size) >> 1526 return; >> 1527 buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data, >> 1528 vmcoreinfo_size); >> 1529 final_note(buf); >> 1530 } >> 1531 >> 1532 void crash_save_vmcoreinfo(void) >> 1533 { >> 1534 vmcoreinfo_append_str("CRASHTIME=%ld\n", get_seconds()); >> 1535 update_vmcoreinfo_note(); >> 1536 } >> 1537 >> 1538 void vmcoreinfo_append_str(const char *fmt, ...) >> 1539 { >> 1540 va_list args; >> 1541 char buf[0x50]; >> 1542 size_t r; >> 1543 >> 1544 va_start(args, fmt); >> 1545 r = vsnprintf(buf, sizeof(buf), fmt, args); >> 1546 va_end(args); >> 1547 >> 1548 r = min(r, vmcoreinfo_max_size - vmcoreinfo_size); >> 1549 >> 1550 memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r); >> 1551 >> 1552 vmcoreinfo_size += r; >> 1553 } >> 1554 >> 1555 /* >> 1556 * provide an empty default implementation here -- architecture >> 1557 * code may override this >> 1558 */ >> 1559 void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void) >> 1560 {} >> 1561 >> 1562 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void) >> 1563 { >> 1564 return __pa((unsigned long)(char *)&vmcoreinfo_note); >> 1565 } >> 1566 >> 1567 static int __init crash_save_vmcoreinfo_init(void) >> 1568 { >> 1569 VMCOREINFO_OSRELEASE(init_uts_ns.name.release); >> 1570 VMCOREINFO_PAGESIZE(PAGE_SIZE); >> 1571 >> 1572 VMCOREINFO_SYMBOL(init_uts_ns); >> 1573 VMCOREINFO_SYMBOL(node_online_map); >> 1574 #ifdef CONFIG_MMU >> 1575 VMCOREINFO_SYMBOL(swapper_pg_dir); >> 1576 #endif >> 1577 VMCOREINFO_SYMBOL(_stext); >> 1578 VMCOREINFO_SYMBOL(vmap_area_list); >> 1579 >> 1580 #ifndef CONFIG_NEED_MULTIPLE_NODES >> 1581 VMCOREINFO_SYMBOL(mem_map); >> 1582 VMCOREINFO_SYMBOL(contig_page_data); >> 1583 #endif >> 1584 #ifdef CONFIG_SPARSEMEM >> 1585 VMCOREINFO_SYMBOL(mem_section); >> 1586 VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS); >> 1587 VMCOREINFO_STRUCT_SIZE(mem_section); >> 1588 VMCOREINFO_OFFSET(mem_section, section_mem_map); >> 1589 #endif >> 1590 VMCOREINFO_STRUCT_SIZE(page); >> 1591 VMCOREINFO_STRUCT_SIZE(pglist_data); >> 1592 VMCOREINFO_STRUCT_SIZE(zone); >> 1593 VMCOREINFO_STRUCT_SIZE(free_area); >> 1594 VMCOREINFO_STRUCT_SIZE(list_head); >> 1595 VMCOREINFO_SIZE(nodemask_t); >> 1596 VMCOREINFO_OFFSET(page, flags); >> 1597 VMCOREINFO_OFFSET(page, _count); >> 1598 VMCOREINFO_OFFSET(page, mapping); >> 1599 VMCOREINFO_OFFSET(page, lru); >> 1600 VMCOREINFO_OFFSET(page, _mapcount); >> 1601 VMCOREINFO_OFFSET(page, 
private); >> 1602 VMCOREINFO_OFFSET(pglist_data, node_zones); >> 1603 VMCOREINFO_OFFSET(pglist_data, nr_zones); >> 1604 #ifdef CONFIG_FLAT_NODE_MEM_MAP >> 1605 VMCOREINFO_OFFSET(pglist_data, node_mem_map); >> 1606 #endif >> 1607 VMCOREINFO_OFFSET(pglist_data, node_start_pfn); >> 1608 VMCOREINFO_OFFSET(pglist_data, node_spanned_pages); >> 1609 VMCOREINFO_OFFSET(pglist_data, node_id); >> 1610 VMCOREINFO_OFFSET(zone, free_area); >> 1611 VMCOREINFO_OFFSET(zone, vm_stat); >> 1612 VMCOREINFO_OFFSET(zone, spanned_pages); >> 1613 VMCOREINFO_OFFSET(free_area, free_list); >> 1614 VMCOREINFO_OFFSET(list_head, next); >> 1615 VMCOREINFO_OFFSET(list_head, prev); >> 1616 VMCOREINFO_OFFSET(vmap_area, va_start); >> 1617 VMCOREINFO_OFFSET(vmap_area, list); >> 1618 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); >> 1619 log_buf_kexec_setup(); >> 1620 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); >> 1621 VMCOREINFO_NUMBER(NR_FREE_PAGES); >> 1622 VMCOREINFO_NUMBER(PG_lru); >> 1623 VMCOREINFO_NUMBER(PG_private); >> 1624 VMCOREINFO_NUMBER(PG_swapcache); >> 1625 VMCOREINFO_NUMBER(PG_slab); >> 1626 #ifdef CONFIG_MEMORY_FAILURE >> 1627 VMCOREINFO_NUMBER(PG_hwpoison); >> 1628 #endif >> 1629 VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); >> 1630 >> 1631 arch_crash_save_vmcoreinfo(); >> 1632 update_vmcoreinfo_note(); >> 1633 >> 1634 return 0; >> 1635 } >> 1636 >> 1637 module_init(crash_save_vmcoreinfo_init) >> 1638 >> 1639 /* >> 1640 * Move into place and start executing a preloaded standalone >> 1641 * executable. If nothing was preloaded return an error. >> 1642 */ >> 1643 int kernel_kexec(void) >> 1644 { >> 1645 int error = 0; >> 1646 >> 1647 if (!mutex_trylock(&kexec_mutex)) >> 1648 return -EBUSY; >> 1649 if (!kexec_image) { >> 1650 error = -EINVAL; >> 1651 goto Unlock; >> 1652 } >> 1653 >> 1654 #ifdef CONFIG_KEXEC_JUMP >> 1655 if (kexec_image->preserve_context) { >> 1656 lock_system_sleep(); >> 1657 pm_prepare_console(); >> 1658 error = freeze_processes(); >> 1659 if (error) { >> 1660 error = -EBUSY; >> 1661 goto Restore_console; >> 1662 } >> 1663 suspend_console(); >> 1664 error = dpm_suspend_start(PMSG_FREEZE); >> 1665 if (error) >> 1666 goto Resume_console; >> 1667 /* At this point, dpm_suspend_start() has been called, >> 1668 * but *not* dpm_suspend_end(). We *must* call >> 1669 * dpm_suspend_end() now. Otherwise, drivers for >> 1670 * some devices (e.g. interrupt controllers) become >> 1671 * desynchronized with the actual state of the >> 1672 * hardware at resume time, and evil weirdness ensues. 
>> 1673 */ >> 1674 error = dpm_suspend_end(PMSG_FREEZE); >> 1675 if (error) >> 1676 goto Resume_devices; >> 1677 error = disable_nonboot_cpus(); >> 1678 if (error) >> 1679 goto Enable_cpus; >> 1680 local_irq_disable(); >> 1681 error = syscore_suspend(); >> 1682 if (error) >> 1683 goto Enable_irqs; >> 1684 } else >> 1685 #endif >> 1686 { >> 1687 kexec_in_progress = true; >> 1688 kernel_restart_prepare(NULL); >> 1689 printk(KERN_EMERG "Starting new kernel\n"); >> 1690 machine_shutdown(); >> 1691 } >> 1692 >> 1693 machine_kexec(kexec_image); >> 1694 >> 1695 #ifdef CONFIG_KEXEC_JUMP >> 1696 if (kexec_image->preserve_context) { >> 1697 syscore_resume(); >> 1698 Enable_irqs: >> 1699 local_irq_enable(); >> 1700 Enable_cpus: >> 1701 enable_nonboot_cpus(); >> 1702 dpm_resume_start(PMSG_RESTORE); >> 1703 Resume_devices: >> 1704 dpm_resume_end(PMSG_RESTORE); >> 1705 Resume_console: >> 1706 resume_console(); >> 1707 thaw_processes(); >> 1708 Restore_console: >> 1709 pm_restore_console(); >> 1710 unlock_system_sleep(); >> 1711 } 311 #endif 1712 #endif >> 1713 >> 1714 Unlock: >> 1715 mutex_unlock(&kexec_mutex); >> 1716 return error; >> 1717 } 312 1718
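/*
 * Editor's note (not part of kexec.c): the header comment above the
 * kexec_load definition describes the system call's three pieces but the
 * listing itself contains no caller.  Below is a minimal, illustrative
 * user-space sketch of invoking kexec_load.  It assumes the uapi
 * <linux/kexec.h> definitions (struct kexec_segment, KEXEC_ARCH_DEFAULT)
 * and that no glibc wrapper exists, so syscall(2) is used directly.  The
 * entry point, destination address and payload buffer are placeholders,
 * not a bootable image; a real loader such as kexec-tools builds these
 * segments from an actual kernel and purgatory.  Running this as-is on a
 * live machine is not meaningful and requires CAP_SYS_BOOT.
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/kexec.h>	/* struct kexec_segment, KEXEC_ARCH_DEFAULT */

int main(void)
{
	static char payload[4096];		/* placeholder segment contents */
	struct kexec_segment seg = {
		.buf	= payload,		/* source buffer in user space */
		.bufsz	= sizeof(payload),
		.mem	= (const void *)0x100000, /* placeholder, page-aligned destination */
		.memsz	= sizeof(payload),	/* memsz >= bufsz, page granular */
	};
	unsigned long entry = 0x100000;		/* placeholder entry point inside a segment */

	/* flags = KEXEC_ARCH_DEFAULT loads a normal (non-crash) image. */
	if (syscall(SYS_kexec_load, entry, 1UL, &seg, KEXEC_ARCH_DEFAULT) == -1) {
		perror("kexec_load");
		return EXIT_FAILURE;
	}
	return 0;
}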