// SPDX-License-Identifier: GPL-2.0
/* Support for MMIO probes.
 * Benefits from much of the kprobes code
 * (C) 2002 Louis Zhuang <louis.zhuang@intel.com>.
 *     2007 Alexander Eichner
 *     2008 Pekka Paalanen <pq@iki.fi>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include <linux/hash.h>
#include <linux/export.h>
#include <linux/kernel.h>
#include <linux/uaccess.h>
#include <linux/ptrace.h>
#include <linux/preempt.h>
#include <linux/percpu.h>
#include <linux/kdebug.h>
#include <linux/mutex.h>
#include <linux/io.h>
#include <linux/slab.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <linux/errno.h>
#include <asm/debugreg.h>
#include <linux/mmiotrace.h>

#define KMMIO_PAGE_HASH_BITS 4
#define KMMIO_PAGE_TABLE_SIZE (1 << KMMIO_PAGE_HASH_BITS)

struct kmmio_fault_page {
	struct list_head list;
	struct kmmio_fault_page *release_next;
	unsigned long addr; /* the requested address */
	pteval_t old_presence; /* page presence prior to arming */
	bool armed;

	/*
	 * Number of times this page has been registered as a part
	 * of a probe. If zero, page is disarmed and this may be freed.
	 * Used only by writers (RCU) and post_kmmio_handler().
	 * Protected by kmmio_lock, when linked into kmmio_page_table.
	 */
	int count;

	bool scheduled_for_release;
};

struct kmmio_delayed_release {
	struct rcu_head rcu;
	struct kmmio_fault_page *release_list;
};

struct kmmio_context {
	struct kmmio_fault_page *fpage;
	struct kmmio_probe *probe;
	unsigned long saved_flags;
	unsigned long addr;
	int active;
};

/*
 * The kmmio_lock is taken in int3 context, which is treated as NMI context.
 * This causes lockdep to complain about it being in both NMI and normal
 * context. Hide it from lockdep, as it should not have any other locks
 * taken under it, and this is only enabled for debugging mmio anyway.
 */
static arch_spinlock_t kmmio_lock = __ARCH_SPIN_LOCK_UNLOCKED;

/* Protected by kmmio_lock */
unsigned int kmmio_count;

/* Read-protected by RCU, write-protected by kmmio_lock. */
static struct list_head kmmio_page_table[KMMIO_PAGE_TABLE_SIZE];
static LIST_HEAD(kmmio_probes);

static struct list_head *kmmio_page_list(unsigned long addr)
{
	unsigned int l;
	pte_t *pte = lookup_address(addr, &l);

	if (!pte)
		return NULL;
	addr &= page_level_mask(l);

	return &kmmio_page_table[hash_long(addr, KMMIO_PAGE_HASH_BITS)];
}

/* Accessed per-cpu */
static DEFINE_PER_CPU(struct kmmio_context, kmmio_ctx);

/*
 * This is basically a dynamic stabbing problem:
 * Could use the existing prio tree code or
 * Possible better implementations:
 * The Interval Skip List: A Data Structure for Finding All Intervals That
 * Overlap a Point (might be simple)
 * Space Efficient Dynamic Stabbing with Fast Queries - Mikkel Thorup
 */
/* Get the kmmio at this addr (if any). You must be holding RCU read lock. */
static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
{
	struct kmmio_probe *p;
	list_for_each_entry_rcu(p, &kmmio_probes, list) {
		if (addr >= p->addr && addr < (p->addr + p->len))
			return p;
	}
	return NULL;
}
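
/*
 * Note: get_kmmio_probe() is a linear RCU walk over kmmio_probes, so probe
 * lookup costs O(number of registered probes) per faulting access, whereas
 * armed pages are found via the KMMIO_PAGE_TABLE_SIZE-bucket hash above,
 * keyed on the page base address. This is acceptable while only a few
 * probes are registered at a time; the papers referenced above describe
 * better interval structures should that ever change.
 */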

/* You must be holding RCU read lock. */
static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long addr)
{
	struct list_head *head;
	struct kmmio_fault_page *f;
	unsigned int l;
	pte_t *pte = lookup_address(addr, &l);

	if (!pte)
		return NULL;
	addr &= page_level_mask(l);
	head = kmmio_page_list(addr);
	list_for_each_entry_rcu(f, head, list) {
		if (f->addr == addr)
			return f;
	}
	return NULL;
}

static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
{
	pmd_t new_pmd;
	pmdval_t v = pmd_val(*pmd);
	if (clear) {
		*old = v;
		new_pmd = pmd_mkinvalid(*pmd);
	} else {
		/* Presume this has been called with clear==true previously */
		new_pmd = __pmd(*old);
	}
	set_pmd(pmd, new_pmd);
}

static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
{
	pteval_t v = pte_val(*pte);
	if (clear) {
		*old = v;
		/* Nothing should care about address */
		pte_clear(&init_mm, 0, pte);
	} else {
		/* Presume this has been called with clear==true previously */
		set_pte_atomic(pte, __pte(*old));
	}
}

static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
{
	unsigned int level;
	pte_t *pte = lookup_address(f->addr, &level);

	if (!pte) {
		pr_err("no pte for addr 0x%08lx\n", f->addr);
		return -1;
	}

	switch (level) {
	case PG_LEVEL_2M:
		clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
		break;
	case PG_LEVEL_4K:
		clear_pte_presence(pte, clear, &f->old_presence);
		break;
	default:
		pr_err("unexpected page level 0x%x.\n", level);
		return -1;
	}

	flush_tlb_one_kernel(f->addr);
	return 0;
}

/*
 * Mark the given page as not present. Access to it will trigger a fault.
 *
 * Struct kmmio_fault_page is protected by RCU and kmmio_lock, but the
 * protection is ignored here. RCU read lock is assumed held, so the struct
 * will not disappear unexpectedly. Furthermore, the caller must guarantee,
 * that double arming the same virtual address (page) cannot occur.
 *
 * Double disarming on the other hand is allowed, and may occur when a fault
 * and mmiotrace shutdown happen simultaneously.
 */
static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
{
	int ret;
	WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
	if (f->armed) {
		pr_warn("double-arm: addr 0x%08lx, ref %d, old %d\n",
			f->addr, f->count, !!f->old_presence);
	}
	ret = clear_page_presence(f, true);
	WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming at 0x%08lx failed.\n"),
		  f->addr);
	f->armed = true;
	return ret;
}

/** Restore the given page to saved presence state. */
static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
{
	int ret = clear_page_presence(f, false);
	WARN_ONCE(ret < 0,
		  KERN_ERR "kmmio disarming at 0x%08lx failed.\n", f->addr);
	f->armed = false;
}
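
/*
 * Note on arming: an armed page is made not-present (pte_clear() for 4k
 * pages, pmd_mkinvalid() for 2M pages), with the previous pte/pmd value
 * saved in old_presence, so that any access to it faults into
 * kmmio_handler() below. Disarming writes the saved value back. Only
 * PG_LEVEL_4K and PG_LEVEL_2M mappings are handled; clear_page_presence()
 * rejects anything else.
 */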

/*
 * This is being called from do_page_fault().
 *
 * We may be in an interrupt or a critical section. Also prefetching may
 * trigger a page fault. We may be in the middle of a process switch.
 * We cannot take any locks, because we could be executing especially
 * within a kmmio critical section.
 *
 * Local interrupts are disabled, so preemption cannot happen.
 * Do not enable interrupts, do not sleep, and watch out for other CPUs.
 */
/*
 * Interrupts are disabled on entry as trap3 is an interrupt gate
 * and they remain disabled throughout this function.
 */
int kmmio_handler(struct pt_regs *regs, unsigned long addr)
{
	struct kmmio_context *ctx;
	struct kmmio_fault_page *faultpage;
	int ret = 0; /* default to fault not handled */
	unsigned long page_base = addr;
	unsigned int l;
	pte_t *pte = lookup_address(addr, &l);
	if (!pte)
		return -EINVAL;
	page_base &= page_level_mask(l);

	/*
	 * Hold the RCU read lock over single stepping to avoid looking
	 * up the probe and kmmio_fault_page again. The rcu_read_lock_sched()
	 * also disables preemption and prevents process switch during
	 * the single stepping. We can only handle one active kmmio trace
	 * per cpu, so ensure that we finish it before something else
	 * gets to run.
	 */
	rcu_read_lock_sched_notrace();

	faultpage = get_kmmio_fault_page(page_base);
	if (!faultpage) {
		/*
		 * Either this page fault is not caused by kmmio, or
		 * another CPU just pulled the kmmio probe from under
		 * our feet. The latter case should not be possible.
		 */
		goto no_kmmio;
	}

	ctx = this_cpu_ptr(&kmmio_ctx);
	if (ctx->active) {
		if (page_base == ctx->addr) {
			/*
			 * A second fault on the same page means some other
			 * condition needs handling by do_page_fault(), the
			 * page really not being present is the most common.
			 */
			pr_debug("secondary hit for 0x%08lx CPU %d.\n",
				 addr, smp_processor_id());

			if (!faultpage->old_presence)
				pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
					addr, smp_processor_id());
		} else {
			/*
			 * Prevent overwriting already in-flight context.
			 * This should not happen, let's hope disarming at
			 * least prevents a panic.
			 */
			pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
				 smp_processor_id(), addr);
			pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
			disarm_kmmio_fault_page(faultpage);
		}
		goto no_kmmio;
	}
	ctx->active++;

	ctx->fpage = faultpage;
	ctx->probe = get_kmmio_probe(page_base);
	ctx->saved_flags = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
	ctx->addr = page_base;

	if (ctx->probe && ctx->probe->pre_handler)
		ctx->probe->pre_handler(ctx->probe, regs, addr);

	/*
	 * Enable single-stepping and disable interrupts for the faulting
	 * context. Local interrupts must not get enabled during stepping.
	 */
	regs->flags |= X86_EFLAGS_TF;
	regs->flags &= ~X86_EFLAGS_IF;

	/* Now we set present bit in PTE and single step. */
	disarm_kmmio_fault_page(ctx->fpage);

	/*
	 * If another cpu accesses the same page while we are stepping,
	 * the access will not be caught. It will simply succeed and the
	 * only downside is we lose the event. If this becomes a problem,
	 * the user should drop to single cpu before tracing.
	 */

	return 1; /* fault handled */

no_kmmio:
	rcu_read_unlock_sched_notrace();
	return ret;
}
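
/*
 * Life cycle of one traced access, as implemented by the two handlers:
 *
 *  1. The armed (not-present) page is touched, the CPU raises #PF and
 *     do_page_fault() calls kmmio_handler().
 *  2. kmmio_handler() records the context, calls the probe's pre_handler(),
 *     disarms the page and sets TF so the access is retried under
 *     single-stepping with local interrupts off.
 *  3. The faulting instruction completes against the real mapping and the
 *     CPU raises #DB, which reaches post_kmmio_handler() via the die
 *     notifier at the bottom of this file.
 *  4. post_kmmio_handler() calls the post_handler(), re-arms the page if it
 *     is still registered, restores the saved flags and drops the RCU read
 *     lock taken in step 2.
 */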

/*
 * Interrupts are disabled on entry as trap1 is an interrupt gate
 * and they remain disabled throughout this function.
 * This must always get called as the pair to kmmio_handler().
 */
static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
{
	int ret = 0;
	struct kmmio_context *ctx = this_cpu_ptr(&kmmio_ctx);

	if (!ctx->active) {
		/*
		 * debug traps without an active context are due to either
		 * something external causing them (e.g. using a debugger
		 * while mmio tracing is enabled), or erroneous behaviour
		 */
		pr_warn("unexpected debug trap on CPU %d.\n", smp_processor_id());
		goto out;
	}

	if (ctx->probe && ctx->probe->post_handler)
		ctx->probe->post_handler(ctx->probe, condition, regs);

	/* Prevent racing against release_kmmio_fault_page(). */
	arch_spin_lock(&kmmio_lock);
	if (ctx->fpage->count)
		arm_kmmio_fault_page(ctx->fpage);
	arch_spin_unlock(&kmmio_lock);

	regs->flags &= ~X86_EFLAGS_TF;
	regs->flags |= ctx->saved_flags;

	/* These were acquired in kmmio_handler(). */
	ctx->active--;
	BUG_ON(ctx->active);
	rcu_read_unlock_sched_notrace();

	/*
	 * if somebody else is singlestepping across a probe point, flags
	 * will have TF set, in which case, continue the remaining processing
	 * of do_debug, as if this is not a probe hit.
	 */
	if (!(regs->flags & X86_EFLAGS_TF))
		ret = 1;
out:
	return ret;
}

/* You must be holding kmmio_lock. */
static int add_kmmio_fault_page(unsigned long addr)
{
	struct kmmio_fault_page *f;

	f = get_kmmio_fault_page(addr);
	if (f) {
		if (!f->count)
			arm_kmmio_fault_page(f);
		f->count++;
		return 0;
	}

	f = kzalloc(sizeof(*f), GFP_ATOMIC);
	if (!f)
		return -1;

	f->count = 1;
	f->addr = addr;

	if (arm_kmmio_fault_page(f)) {
		kfree(f);
		return -1;
	}

	list_add_rcu(&f->list, kmmio_page_list(f->addr));

	return 0;
}

/* You must be holding kmmio_lock. */
static void release_kmmio_fault_page(unsigned long addr,
				     struct kmmio_fault_page **release_list)
{
	struct kmmio_fault_page *f;

	f = get_kmmio_fault_page(addr);
	if (!f)
		return;

	f->count--;
	BUG_ON(f->count < 0);
	if (!f->count) {
		disarm_kmmio_fault_page(f);
		if (!f->scheduled_for_release) {
			f->release_next = *release_list;
			*release_list = f;
			f->scheduled_for_release = true;
		}
	}
}

/*
 * With page-unaligned ioremaps, one or two armed pages may contain
 * addresses from outside the intended mapping. Events for these addresses
 * are currently silently dropped. The events may result only from programming
 * mistakes by accessing addresses before the beginning or past the end of a
 * mapping.
 */
int register_kmmio_probe(struct kmmio_probe *p)
{
	unsigned long flags;
	int ret = 0;
	unsigned long size = 0;
	unsigned long addr = p->addr & PAGE_MASK;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
	unsigned int l;
	pte_t *pte;

	local_irq_save(flags);
	arch_spin_lock(&kmmio_lock);
	if (get_kmmio_probe(addr)) {
		ret = -EEXIST;
		goto out;
	}

	pte = lookup_address(addr, &l);
	if (!pte) {
		ret = -EINVAL;
		goto out;
	}

	kmmio_count++;
	list_add_rcu(&p->list, &kmmio_probes);
	while (size < size_lim) {
		if (add_kmmio_fault_page(addr + size))
			pr_err("Unable to set page fault.\n");
		size += page_level_size(l);
	}
out:
	arch_spin_unlock(&kmmio_lock);
	local_irq_restore(flags);

	/*
	 * XXX: What should I do here?
	 * Here was a call to global_flush_tlb(), but it does not exist
	 * anymore. It seems it's not needed after all.
	 */
	return ret;
}
EXPORT_SYMBOL(register_kmmio_probe);
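
/*
 * Worked example for the arming loop above (addresses are made up): with
 * p->addr = 0xffffc90000001ff0, p->len = 0x20 and 4k pages, addr rounds
 * down to 0xffffc90000001000 and size_lim = 0x20 + 0xff0 = 0x1010, so the
 * loop arms two fault pages (0x...1000 and 0x...2000) even though the
 * traced range is only 0x20 bytes long. Faults on the armed pages that
 * fall outside the registered range are still single-stepped but, as noted
 * above, their events are silently dropped.
 */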

static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr = container_of(
						head,
						struct kmmio_delayed_release,
						rcu);
	struct kmmio_fault_page *f = dr->release_list;
	while (f) {
		struct kmmio_fault_page *next = f->release_next;
		BUG_ON(f->count);
		kfree(f);
		f = next;
	}
	kfree(dr);
}

static void remove_kmmio_fault_pages(struct rcu_head *head)
{
	struct kmmio_delayed_release *dr =
		container_of(head, struct kmmio_delayed_release, rcu);
	struct kmmio_fault_page *f = dr->release_list;
	struct kmmio_fault_page **prevp = &dr->release_list;
	unsigned long flags;

	local_irq_save(flags);
	arch_spin_lock(&kmmio_lock);
	while (f) {
		if (!f->count) {
			list_del_rcu(&f->list);
			prevp = &f->release_next;
		} else {
			*prevp = f->release_next;
			f->release_next = NULL;
			f->scheduled_for_release = false;
		}
		f = *prevp;
	}
	arch_spin_unlock(&kmmio_lock);
	local_irq_restore(flags);

	/* This is the real RCU destroy call. */
	call_rcu(&dr->rcu, rcu_free_kmmio_fault_pages);
}

/*
 * Remove a kmmio probe. You have to synchronize_rcu() before you can be
 * sure that the callbacks will not be called anymore. Only after that
 * you may actually release your struct kmmio_probe.
 *
 * Unregistering a kmmio fault page has three steps:
 * 1. release_kmmio_fault_page()
 *    Disarm the page, wait a grace period to let all faults finish.
 * 2. remove_kmmio_fault_pages()
 *    Remove the pages from kmmio_page_table.
 * 3. rcu_free_kmmio_fault_pages()
 *    Actually free the kmmio_fault_page structs, after a further RCU
 *    grace period.
 */
void unregister_kmmio_probe(struct kmmio_probe *p)
{
	unsigned long flags;
	unsigned long size = 0;
	unsigned long addr = p->addr & PAGE_MASK;
	const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK);
	struct kmmio_fault_page *release_list = NULL;
	struct kmmio_delayed_release *drelease;
	unsigned int l;
	pte_t *pte;

	pte = lookup_address(addr, &l);
	if (!pte)
		return;

	local_irq_save(flags);
	arch_spin_lock(&kmmio_lock);
	while (size < size_lim) {
		release_kmmio_fault_page(addr + size, &release_list);
		size += page_level_size(l);
	}
	list_del_rcu(&p->list);
	kmmio_count--;
	arch_spin_unlock(&kmmio_lock);
	local_irq_restore(flags);

	if (!release_list)
		return;

	drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
	if (!drelease) {
		pr_crit("leaking kmmio_fault_page objects.\n");
		return;
	}
	drelease->release_list = release_list;

	/*
	 * This is not really RCU here. We have just disarmed a set of
	 * pages so that they cannot trigger page faults anymore. However,
	 * we cannot remove the pages from kmmio_page_table,
	 * because a probe hit might be in flight on another CPU. The
	 * pages are collected into a list, and they will be removed from
	 * kmmio_page_table when it is certain that no probe hit related to
	 * these pages can be in flight. RCU grace period sounds like a
	 * good choice.
	 *
	 * If we removed the pages too early, the kmmio page fault handler
	 * might not find the respective kmmio_fault_page and decide it is
	 * not a kmmio fault, when it actually is. This would lead to madness.
	 */
	call_rcu(&drelease->rcu, remove_kmmio_fault_pages);
}
EXPORT_SYMBOL(unregister_kmmio_probe);
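
/*
 * Note that the release path above spans two chained RCU grace periods:
 * unregister_kmmio_probe() queues remove_kmmio_fault_pages(), which unlinks
 * the disarmed pages from kmmio_page_table only after the first grace
 * period, and that in turn queues rcu_free_kmmio_fault_pages(), which
 * kfree()s them only after a second grace period, once no reader can still
 * be walking the hash lists.
 */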

static int
kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
{
	struct die_args *arg = args;
	unsigned long *dr6_p = (unsigned long *)ERR_PTR(arg->err);

	if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
		if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
			/*
			 * Reset the BS bit in dr6 (pointed by args->err) to
			 * denote completion of processing
			 */
			*dr6_p &= ~DR_STEP;
			return NOTIFY_STOP;
		}

	return NOTIFY_DONE;
}

static struct notifier_block nb_die = {
	.notifier_call = kmmio_die_notifier
};

int kmmio_init(void)
{
	int i;

	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
		INIT_LIST_HEAD(&kmmio_page_table[i]);

	return register_die_notifier(&nb_die);
}

void kmmio_cleanup(void)
{
	int i;

	unregister_die_notifier(&nb_die);
	for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
		WARN_ONCE(!list_empty(&kmmio_page_table[i]),
			  KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
	}
}
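
/*
 * Illustrative only (not part of this file): a minimal sketch of how a
 * client such as mmiotrace might drive this API, kept inside #if 0 so it
 * is never built. Field and handler names follow the uses in this file;
 * the authoritative definition of struct kmmio_probe is in
 * <linux/mmiotrace.h>, and my_pre()/my_post()/my_probe are hypothetical.
 */
#if 0
static void my_pre(struct kmmio_probe *p, struct pt_regs *regs,
		   unsigned long addr)
{
	/* Runs from the page fault, before the access is single-stepped. */
	pr_info("mmio access at 0x%lx\n", addr);
}

static void my_post(struct kmmio_probe *p, unsigned long condition,
		    struct pt_regs *regs)
{
	/* Runs from the debug trap, after the access has completed. */
}

static struct kmmio_probe my_probe;

static int my_trace_region(unsigned long mapped_addr, unsigned long len)
{
	my_probe.addr = mapped_addr;	/* virtual address of an ioremap */
	my_probe.len = len;
	my_probe.pre_handler = my_pre;
	my_probe.post_handler = my_post;
	return register_kmmio_probe(&my_probe);
}

static void my_untrace_region(void)
{
	unregister_kmmio_probe(&my_probe);
	/* Wait for in-flight handlers before reusing/freeing the probe. */
	synchronize_rcu();
}
#endif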