1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PREEMPT_H #define __LINUX_PREEMPT_H /* * include/linux/preempt.h - macros for accessing and manipulating * preempt_count (used for kernel preemption, interrupt count, etc.) */ #include <linux/linkage.h> #include <linux/list.h> /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 #define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional * value so that it also works on !PREEMPT_COUNT kernels. * * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* * Initial preempt_count value; reflects the preempt_count schedule invariant * which states that during context switches: * * preempt_count() == 2*PREEMPT_DISABLE_OFFSET * * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. * Note: See finish_task_switch(). */ #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ | NMI_MASK)) /* * Are we doing bottom half or hardware interrupt processing? * * in_irq() - We're in (hard) IRQ context * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled * in_serving_softirq() - We're in softirq context * in_nmi() - We're in NMI context * in_task() - We're in task context * * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really * should not be used in new code. */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #define in_nmi() (preempt_count() & NMI_MASK) #define in_task() (!(preempt_count() & \ (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) /* * The preempt_count offset after preempt_disable(); */ #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else # define PREEMPT_DISABLE_OFFSET 0 #endif /* * The preempt_count offset after spin_lock() */ #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET /* * The preempt_count offset needed for things like: * * spin_lock_bh() * * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and * softirqs, such that unlock sequences of: * * spin_unlock(); * local_bh_enable(); * * Work as expected. */ #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about * held spinlocks in non-preemptible kernels. Thus it should not be * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. */ #define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif #define __preempt_count_inc() __preempt_count_add(1) #define __preempt_count_dec() __preempt_count_sub(1) #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ __preempt_schedule(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ __preempt_schedule_notrace(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched(0)) \ __preempt_schedule(); \ } while (0) #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #define preempt_check_resched() do { } while (0) #endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ /* * Even if we don't have any preemption, we need preempt disable/enable * to be barriers, so that we don't have things like get_user/put_user * that can cause faults and scheduling migrate into our preempt-protected * region. */ #define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() #define preempt_enable_no_resched() barrier() #define preempt_enable() barrier() #define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() #define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE /* * Modules have no business playing preemption tricks. */ #undef sched_preempt_enable_no_resched #undef preempt_enable_no_resched #undef preempt_enable_no_resched_notrace #undef preempt_check_resched #endif #define preempt_set_need_resched() \ do { \ set_preempt_need_resched(); \ } while (0) #define preempt_fold_need_resched() \ do { \ if (tif_need_resched()) \ set_preempt_need_resched(); \ } while (0) #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; /** * preempt_ops - notifiers called when a task is preempted and rescheduled * @sched_in: we're about to be rescheduled: * notifier: struct preempt_notifier for the task being scheduled * cpu: cpu we're scheduled on * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out * * Please note that sched_in and out are called under different * contexts. sched_out is called with rq lock held and irq disabled * while sched_in is called without rq lock and irq enabled. This * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); void (*sched_out)(struct preempt_notifier *notifier, struct task_struct *next); }; /** * preempt_notifier - key for installing preemption notifiers * @link: internal use * @ops: defines the notifier functions to be called * * Usually used in conjunction with container_of(). */ struct preempt_notifier { struct hlist_node link; struct preempt_ops *ops; }; void preempt_notifier_inc(void); void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { INIT_HLIST_NODE(&notifier->link); notifier->ops = ops; } #endif /** * migrate_disable - Prevent migration of the current task * * Maps to preempt_disable() which also disables preemption. Use * migrate_disable() to annotate that the intent is to prevent migration, * but not necessarily preemption. * * Can be invoked nested like preempt_disable() and needs the corresponding * number of migrate_enable() invocations. */ static __always_inline void migrate_disable(void) { preempt_disable(); } /** * migrate_enable - Allow migration of the current task * * Counterpart to migrate_disable(). * * As migrate_disable() can be invoked nested, only the outermost invocation * reenables migration. * * Currently mapped to preempt_enable(). */ static __always_inline void migrate_enable(void) { preempt_enable(); } #endif /* __LINUX_PREEMPT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PARAVIRT_H #define _ASM_X86_PARAVIRT_H /* Various instructions on x86 need to be replaced for * para-virtualization: those hooks are defined here. */ #ifdef CONFIG_PARAVIRT #include <asm/pgtable_types.h> #include <asm/asm.h> #include <asm/nospec-branch.h> #include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ #include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> #include <asm/frame.h> static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, time.sched_clock); } struct static_key; extern struct static_key paravirt_steal_enabled; extern struct static_key paravirt_steal_rq_enabled; __visible void __native_queued_spin_unlock(struct qspinlock *lock); bool pv_is_native_spin_unlock(void); __visible bool __native_vcpu_is_preempted(long cpu); bool pv_is_native_vcpu_is_preempted(void); static inline u64 paravirt_steal_clock(int cpu) { return PVOP_CALL1(u64, time.steal_clock, cpu); } /* The paravirtualized I/O functions */ static inline void slow_down_io(void) { pv_ops.cpu.io_delay(); #ifdef REALLY_SLOW_IO pv_ops.cpu.io_delay(); pv_ops.cpu.io_delay(); pv_ops.cpu.io_delay(); #endif } void native_flush_tlb_local(void); void native_flush_tlb_global(void); void native_flush_tlb_one_user(unsigned long addr); void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); static inline void __flush_tlb_local(void) { PVOP_VCALL0(mmu.flush_tlb_user); } static inline void __flush_tlb_global(void) { PVOP_VCALL0(mmu.flush_tlb_kernel); } static inline void __flush_tlb_one_user(unsigned long addr) { PVOP_VCALL1(mmu.flush_tlb_one_user, addr); } static inline void __flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info) { PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info); } static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) { PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); } static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { PVOP_VCALL1(mmu.exit_mmap, mm); } #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { PVOP_VCALL1(cpu.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ static inline void __cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { PVOP_VCALL4(cpu.cpuid, eax, ebx, ecx, edx); } /* * These special macros can be used to get or set a debugging register */ static inline unsigned long paravirt_get_debugreg(int reg) { return PVOP_CALL1(unsigned long, cpu.get_debugreg, reg); } #define get_debugreg(var, reg) var = paravirt_get_debugreg(reg) static inline void set_debugreg(unsigned long val, int reg) { PVOP_VCALL2(cpu.set_debugreg, reg, val); } static inline unsigned long read_cr0(void) { return PVOP_CALL0(unsigned long, cpu.read_cr0); } static inline void write_cr0(unsigned long x) { PVOP_VCALL1(cpu.write_cr0, x); } static inline unsigned long read_cr2(void) { return PVOP_CALLEE0(unsigned long, mmu.read_cr2); } static inline void write_cr2(unsigned long x) { PVOP_VCALL1(mmu.write_cr2, x); } static inline unsigned long __read_cr3(void) { return PVOP_CALL0(unsigned long, mmu.read_cr3); } static inline void write_cr3(unsigned long x) { PVOP_VCALL1(mmu.write_cr3, x); } static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(cpu.write_cr4, x); } static inline void arch_safe_halt(void) { PVOP_VCALL0(irq.safe_halt); } static inline void halt(void) { PVOP_VCALL0(irq.halt); } static inline void wbinvd(void) { PVOP_VCALL0(cpu.wbinvd); } static inline u64 paravirt_read_msr(unsigned msr) { return PVOP_CALL1(u64, cpu.read_msr, msr); } static inline void paravirt_write_msr(unsigned msr, unsigned low, unsigned high) { PVOP_VCALL3(cpu.write_msr, msr, low, high); } static inline u64 paravirt_read_msr_safe(unsigned msr, int *err) { return PVOP_CALL2(u64, cpu.read_msr_safe, msr, err); } static inline int paravirt_write_msr_safe(unsigned msr, unsigned low, unsigned high) { return PVOP_CALL3(int, cpu.write_msr_safe, msr, low, high); } #define rdmsr(msr, val1, val2) \ do { \ u64 _l = paravirt_read_msr(msr); \ val1 = (u32)_l; \ val2 = _l >> 32; \ } while (0) #define wrmsr(msr, val1, val2) \ do { \ paravirt_write_msr(msr, val1, val2); \ } while (0) #define rdmsrl(msr, val) \ do { \ val = paravirt_read_msr(msr); \ } while (0) static inline void wrmsrl(unsigned msr, u64 val) { wrmsr(msr, (u32)val, (u32)(val>>32)); } #define wrmsr_safe(msr, a, b) paravirt_write_msr_safe(msr, a, b) /* rdmsr with exception handling */ #define rdmsr_safe(msr, a, b) \ ({ \ int _err; \ u64 _l = paravirt_read_msr_safe(msr, &_err); \ (*a) = (u32)_l; \ (*b) = _l >> 32; \ _err; \ }) static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) { int err; *p = paravirt_read_msr_safe(msr, &err); return err; } static inline unsigned long long paravirt_read_pmc(int counter) { return PVOP_CALL1(u64, cpu.read_pmc, counter); } #define rdpmc(counter, low, high) \ do { \ u64 _l = paravirt_read_pmc(counter); \ low = (u32)_l; \ high = _l >> 32; \ } while (0) #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.alloc_ldt, ldt, entries); } static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(cpu.free_ldt, ldt, entries); } static inline void load_TR_desc(void) { PVOP_VCALL0(cpu.load_tr_desc); } static inline void load_gdt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_gdt, dtr); } static inline void load_idt(const struct desc_ptr *dtr) { PVOP_VCALL1(cpu.load_idt, dtr); } static inline void set_ldt(const void *addr, unsigned entries) { PVOP_VCALL2(cpu.set_ldt, addr, entries); } static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, cpu.store_tr); } #define store_tr(tr) ((tr) = paravirt_store_tr()) static inline void load_TLS(struct thread_struct *t, unsigned cpu) { PVOP_VCALL2(cpu.load_tls, t, cpu); } static inline void load_gs_index(unsigned int gs) { PVOP_VCALL1(cpu.load_gs_index, gs); } static inline void write_ldt_entry(struct desc_struct *dt, int entry, const void *desc) { PVOP_VCALL3(cpu.write_ldt_entry, dt, entry, desc); } static inline void write_gdt_entry(struct desc_struct *dt, int entry, void *desc, int type) { PVOP_VCALL4(cpu.write_gdt_entry, dt, entry, desc, type); } static inline void write_idt_entry(gate_desc *dt, int entry, const gate_desc *g) { PVOP_VCALL3(cpu.write_idt_entry, dt, entry, g); } #ifdef CONFIG_X86_IOPL_IOPERM static inline void tss_invalidate_io_bitmap(void) { PVOP_VCALL0(cpu.invalidate_io_bitmap); } static inline void tss_update_io_bitmap(void) { PVOP_VCALL0(cpu.update_io_bitmap); } #endif static inline void paravirt_activate_mm(struct mm_struct *prev, struct mm_struct *next) { PVOP_VCALL2(mmu.activate_mm, prev, next); } static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { PVOP_VCALL2(mmu.dup_mmap, oldmm, mm); } static inline int paravirt_pgd_alloc(struct mm_struct *mm) { return PVOP_CALL1(int, mmu.pgd_alloc, mm); } static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) { PVOP_VCALL2(mmu.pgd_free, mm, pgd); } static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pte, mm, pfn); } static inline void paravirt_release_pte(unsigned long pfn) { PVOP_VCALL1(mmu.release_pte, pfn); } static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pmd, mm, pfn); } static inline void paravirt_release_pmd(unsigned long pfn) { PVOP_VCALL1(mmu.release_pmd, pfn); } static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_pud, mm, pfn); } static inline void paravirt_release_pud(unsigned long pfn) { PVOP_VCALL1(mmu.release_pud, pfn); } static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) { PVOP_VCALL2(mmu.alloc_p4d, mm, pfn); } static inline void paravirt_release_p4d(unsigned long pfn) { PVOP_VCALL1(mmu.release_p4d, pfn); } static inline pte_t __pte(pteval_t val) { return (pte_t) { PVOP_CALLEE1(pteval_t, mmu.make_pte, val) }; } static inline pteval_t pte_val(pte_t pte) { return PVOP_CALLEE1(pteval_t, mmu.pte_val, pte.pte); } static inline pgd_t __pgd(pgdval_t val) { return (pgd_t) { PVOP_CALLEE1(pgdval_t, mmu.make_pgd, val) }; } static inline pgdval_t pgd_val(pgd_t pgd) { return PVOP_CALLEE1(pgdval_t, mmu.pgd_val, pgd.pgd); } #define __HAVE_ARCH_PTEP_MODIFY_PROT_TRANSACTION static inline pte_t ptep_modify_prot_start(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { pteval_t ret; ret = PVOP_CALL3(pteval_t, mmu.ptep_modify_prot_start, vma, addr, ptep); return (pte_t) { .pte = ret }; } static inline void ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t old_pte, pte_t pte) { PVOP_VCALL4(mmu.ptep_modify_prot_commit, vma, addr, ptep, pte.pte); } static inline void set_pte(pte_t *ptep, pte_t pte) { PVOP_VCALL2(mmu.set_pte, ptep, pte.pte); } static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { PVOP_VCALL2(mmu.set_pmd, pmdp, native_pmd_val(pmd)); } static inline pmd_t __pmd(pmdval_t val) { return (pmd_t) { PVOP_CALLEE1(pmdval_t, mmu.make_pmd, val) }; } static inline pmdval_t pmd_val(pmd_t pmd) { return PVOP_CALLEE1(pmdval_t, mmu.pmd_val, pmd.pmd); } static inline void set_pud(pud_t *pudp, pud_t pud) { PVOP_VCALL2(mmu.set_pud, pudp, native_pud_val(pud)); } static inline pud_t __pud(pudval_t val) { pudval_t ret; ret = PVOP_CALLEE1(pudval_t, mmu.make_pud, val); return (pud_t) { ret }; } static inline pudval_t pud_val(pud_t pud) { return PVOP_CALLEE1(pudval_t, mmu.pud_val, pud.pud); } static inline void pud_clear(pud_t *pudp) { set_pud(pudp, native_make_pud(0)); } static inline void set_p4d(p4d_t *p4dp, p4d_t p4d) { p4dval_t val = native_p4d_val(p4d); PVOP_VCALL2(mmu.set_p4d, p4dp, val); } #if CONFIG_PGTABLE_LEVELS >= 5 static inline p4d_t __p4d(p4dval_t val) { p4dval_t ret = PVOP_CALLEE1(p4dval_t, mmu.make_p4d, val); return (p4d_t) { ret }; } static inline p4dval_t p4d_val(p4d_t p4d) { return PVOP_CALLEE1(p4dval_t, mmu.p4d_val, p4d.p4d); } static inline void __set_pgd(pgd_t *pgdp, pgd_t pgd) { PVOP_VCALL2(mmu.set_pgd, pgdp, native_pgd_val(pgd)); } #define set_pgd(pgdp, pgdval) do { \ if (pgtable_l5_enabled()) \ __set_pgd(pgdp, pgdval); \ else \ set_p4d((p4d_t *)(pgdp), (p4d_t) { (pgdval).pgd }); \ } while (0) #define pgd_clear(pgdp) do { \ if (pgtable_l5_enabled()) \ set_pgd(pgdp, native_make_pgd(0)); \ } while (0) #endif /* CONFIG_PGTABLE_LEVELS == 5 */ static inline void p4d_clear(p4d_t *p4dp) { set_p4d(p4dp, native_make_p4d(0)); } static inline void set_pte_atomic(pte_t *ptep, pte_t pte) { set_pte(ptep, pte); } static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { set_pte(ptep, native_make_pte(0)); } static inline void pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, native_make_pmd(0)); } #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(struct task_struct *prev) { PVOP_VCALL1(cpu.start_context_switch, prev); } static inline void arch_end_context_switch(struct task_struct *next) { PVOP_VCALL1(cpu.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.enter); } static inline void arch_leave_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.leave); } static inline void arch_flush_lazy_mmu_mode(void) { PVOP_VCALL0(mmu.lazy_mode.flush); } static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, phys_addr_t phys, pgprot_t flags) { pv_ops.mmu.set_fixmap(idx, phys, flags); } #endif #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { PVOP_VCALL2(lock.queued_spin_lock_slowpath, lock, val); } static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) { PVOP_VCALLEE1(lock.queued_spin_unlock, lock); } static __always_inline void pv_wait(u8 *ptr, u8 val) { PVOP_VCALL2(lock.wait, ptr, val); } static __always_inline void pv_kick(int cpu) { PVOP_VCALL1(lock.kick, cpu); } static __always_inline bool pv_vcpu_is_preempted(long cpu) { return PVOP_CALLEE1(bool, lock.vcpu_is_preempted, cpu); } void __raw_callee_save___native_queued_spin_unlock(struct qspinlock *lock); bool __raw_callee_save___native_vcpu_is_preempted(long cpu); #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS "pushl %ecx;" #define PV_RESTORE_ALL_CALLER_REGS "popl %ecx;" #else /* save and restore all caller-save registers, except return value */ #define PV_SAVE_ALL_CALLER_REGS \ "push %rcx;" \ "push %rdx;" \ "push %rsi;" \ "push %rdi;" \ "push %r8;" \ "push %r9;" \ "push %r10;" \ "push %r11;" #define PV_RESTORE_ALL_CALLER_REGS \ "pop %r11;" \ "pop %r10;" \ "pop %r9;" \ "pop %r8;" \ "pop %rdi;" \ "pop %rsi;" \ "pop %rdx;" \ "pop %rcx;" #endif /* * Generate a thunk around a function which saves all caller-save * registers except for the return value. This allows C functions to * be called from assembler code where fewer than normal registers are * available. It may also help code generation around calls from C * code if the common case doesn't use many registers. * * When a callee is wrapped in a thunk, the caller can assume that all * arg regs and all scratch registers are preserved across the * call. The return value in rax/eax will not be saved, even for void * functions. */ #define PV_THUNK_NAME(func) "__raw_callee_save_" #func #define PV_CALLEE_SAVE_REGS_THUNK(func) \ extern typeof(func) __raw_callee_save_##func; \ \ asm(".pushsection .text;" \ ".globl " PV_THUNK_NAME(func) ";" \ ".type " PV_THUNK_NAME(func) ", @function;" \ PV_THUNK_NAME(func) ":" \ FRAME_BEGIN \ PV_SAVE_ALL_CALLER_REGS \ "call " #func ";" \ PV_RESTORE_ALL_CALLER_REGS \ FRAME_END \ "ret;" \ ".size " PV_THUNK_NAME(func) ", .-" PV_THUNK_NAME(func) ";" \ ".popsection") /* Get a reference to a callee-save function */ #define PV_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { __raw_callee_save_##func }) /* Promise that "func" already uses the right calling convention */ #define __PV_IS_CALLEE_SAVE(func) \ ((struct paravirt_callee_save) { func }) #ifdef CONFIG_PARAVIRT_XXL static inline notrace unsigned long arch_local_save_flags(void) { return PVOP_CALLEE0(unsigned long, irq.save_fl); } static inline notrace void arch_local_irq_restore(unsigned long f) { PVOP_VCALLEE1(irq.restore_fl, f); } static inline notrace void arch_local_irq_disable(void) { PVOP_VCALLEE0(irq.irq_disable); } static inline notrace void arch_local_irq_enable(void) { PVOP_VCALLEE0(irq.irq_enable); } static inline notrace unsigned long arch_local_irq_save(void) { unsigned long f; f = arch_local_save_flags(); arch_local_irq_disable(); return f; } #endif /* Make sure as little as possible of this mess escapes. */ #undef PARAVIRT_CALL #undef __PVOP_CALL #undef __PVOP_VCALL #undef PVOP_VCALL0 #undef PVOP_CALL0 #undef PVOP_VCALL1 #undef PVOP_CALL1 #undef PVOP_VCALL2 #undef PVOP_CALL2 #undef PVOP_VCALL3 #undef PVOP_CALL3 #undef PVOP_VCALL4 #undef PVOP_CALL4 extern void default_banner(void); #else /* __ASSEMBLY__ */ #define _PVSITE(ptype, ops, word, algn) \ 771:; \ ops; \ 772:; \ .pushsection .parainstructions,"a"; \ .align algn; \ word 771b; \ .byte ptype; \ .byte 772b-771b; \ .popsection #define COND_PUSH(set, mask, reg) \ .if ((~(set)) & mask); push %reg; .endif #define COND_POP(set, mask, reg) \ .if ((~(set)) & mask); pop %reg; .endif #ifdef CONFIG_X86_64 #define PV_SAVE_REGS(set) \ COND_PUSH(set, CLBR_RAX, rax); \ COND_PUSH(set, CLBR_RCX, rcx); \ COND_PUSH(set, CLBR_RDX, rdx); \ COND_PUSH(set, CLBR_RSI, rsi); \ COND_PUSH(set, CLBR_RDI, rdi); \ COND_PUSH(set, CLBR_R8, r8); \ COND_PUSH(set, CLBR_R9, r9); \ COND_PUSH(set, CLBR_R10, r10); \ COND_PUSH(set, CLBR_R11, r11) #define PV_RESTORE_REGS(set) \ COND_POP(set, CLBR_R11, r11); \ COND_POP(set, CLBR_R10, r10); \ COND_POP(set, CLBR_R9, r9); \ COND_POP(set, CLBR_R8, r8); \ COND_POP(set, CLBR_RDI, rdi); \ COND_POP(set, CLBR_RSI, rsi); \ COND_POP(set, CLBR_RDX, rdx); \ COND_POP(set, CLBR_RCX, rcx); \ COND_POP(set, CLBR_RAX, rax) #define PARA_PATCH(off) ((off) / 8) #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .quad, 8) #define PARA_INDIRECT(addr) *addr(%rip) #else #define PV_SAVE_REGS(set) \ COND_PUSH(set, CLBR_EAX, eax); \ COND_PUSH(set, CLBR_EDI, edi); \ COND_PUSH(set, CLBR_ECX, ecx); \ COND_PUSH(set, CLBR_EDX, edx) #define PV_RESTORE_REGS(set) \ COND_POP(set, CLBR_EDX, edx); \ COND_POP(set, CLBR_ECX, ecx); \ COND_POP(set, CLBR_EDI, edi); \ COND_POP(set, CLBR_EAX, eax) #define PARA_PATCH(off) ((off) / 4) #define PARA_SITE(ptype, ops) _PVSITE(ptype, ops, .long, 4) #define PARA_INDIRECT(addr) *%cs:addr #endif #ifdef CONFIG_PARAVIRT_XXL #define INTERRUPT_RETURN \ PARA_SITE(PARA_PATCH(PV_CPU_iret), \ ANNOTATE_RETPOLINE_SAFE; \ jmp PARA_INDIRECT(pv_ops+PV_CPU_iret);) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(PV_IRQ_irq_disable), \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_IRQ_irq_disable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(PV_IRQ_irq_enable), \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #endif #ifdef CONFIG_X86_64 #ifdef CONFIG_PARAVIRT_XXL /* * If swapgs is used while the userspace stack is still current, * there's no way to call a pvop. The PV replacement *must* be * inlined, or the swapgs instruction must be trapped and emulated. */ #define SWAPGS_UNSAFE_STACK \ PARA_SITE(PARA_PATCH(PV_CPU_swapgs), swapgs) /* * Note: swapgs is very special, and in practise is either going to be * implemented with a single "swapgs" instruction or something very * special. Either way, we don't need to save any registers for * it. */ #define SWAPGS \ PARA_SITE(PARA_PATCH(PV_CPU_swapgs), \ ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ ) #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ ANNOTATE_RETPOLINE_SAFE; \ jmp PARA_INDIRECT(pv_ops+PV_CPU_usergs_sysret64);) #ifdef CONFIG_DEBUG_ENTRY #define SAVE_FLAGS(clobbers) \ PARA_SITE(PARA_PATCH(PV_IRQ_save_fl), \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #endif #endif /* CONFIG_PARAVIRT_XXL */ #endif /* CONFIG_X86_64 */ #ifdef CONFIG_PARAVIRT_XXL #define GET_CR2_INTO_AX \ PARA_SITE(PARA_PATCH(PV_MMU_read_cr2), \ ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); \ ) #endif /* CONFIG_PARAVIRT_XXL */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ # define default_banner x86_init_noop #endif /* !CONFIG_PARAVIRT */ #ifndef __ASSEMBLY__ #ifndef CONFIG_PARAVIRT_XXL static inline void paravirt_arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { } #endif #ifndef CONFIG_PARAVIRT static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) { } #endif #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PARAVIRT_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 // SPDX-License-Identifier: GPL-2.0 #include <linux/memblock.h> #include <linux/mmdebug.h> #include <linux/export.h> #include <linux/mm.h> #include <asm/page.h> #include <linux/vmalloc.h> #include "physaddr.h" #ifdef CONFIG_X86_64 #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { x = y + phys_base; VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); } else { x = y + (__START_KERNEL_map - PAGE_OFFSET); /* carry flag will be set if starting x was >= PAGE_OFFSET */ VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); } return x; } EXPORT_SYMBOL(__phys_addr); unsigned long __phys_addr_symbol(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* only check upper bounds since lower bounds will trigger carry */ VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); return y + phys_base; } EXPORT_SYMBOL(__phys_addr_symbol); #endif bool __virt_addr_valid(unsigned long x) { unsigned long y = x - __START_KERNEL_map; /* use the carry flag to determine if x was < __START_KERNEL_map */ if (unlikely(x > y)) { x = y + phys_base; if (y >= KERNEL_IMAGE_SIZE) return false; } else { x = y + (__START_KERNEL_map - PAGE_OFFSET); /* carry flag will be set if starting x was >= PAGE_OFFSET */ if ((x > y) || !phys_addr_valid(x)) return false; } return pfn_valid(x >> PAGE_SHIFT); } EXPORT_SYMBOL(__virt_addr_valid); #else #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { unsigned long phys_addr = x - PAGE_OFFSET; /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); /* max_low_pfn is set early, but not _that_ early */ if (max_low_pfn) { VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); } return phys_addr; } EXPORT_SYMBOL(__phys_addr); #endif bool __virt_addr_valid(unsigned long x) { if (x < PAGE_OFFSET) return false; if (__vmalloc_start_set && is_vmalloc_addr((void *) x)) return false; if (x >= FIXADDR_START) return false; return pfn_valid((x - PAGE_OFFSET) >> PAGE_SHIFT); } EXPORT_SYMBOL(__virt_addr_valid); #endif /* CONFIG_X86_64 */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NODEMASK_H #define __LINUX_NODEMASK_H /* * Nodemasks provide a bitmap suitable for representing the * set of Node's in a system, one bit position per Node number. * * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * * For details of nodemask_parse_user(), see bitmap_parse_user() in * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in * lib/bitmap.c. * * The available nodemask operations are: * * void node_set(node, mask) turn on bit 'node' in mask * void node_clear(node, mask) turn off bit 'node' in mask * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask * int node_test_and_set(node, mask) test and set bit 'node' in mask * * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * * int nodes_equal(mask1, mask2) Does mask1 == mask2? * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * * int node_random(mask) Random node with set bit in mask * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. * * NODEMASK_SCRATCH * When doing above logical AND, OR, XOR, Remap operations the callers tend to * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper * for such situations. See below and CPUMASK_ALLOC also. */ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/numa.h> typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; /** * nodemask_pr_args - printf args to output a nodemask * @maskp: nodemask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions * are called in both __init and non-__init functions, if they are not * inlined we will end up with a section mis-match error (of the type of * freeable items not being freed). So we must use __always_inline here * to fix the problem. If other functions in the future also end up in * this situation they will also need to be annotated as __always_inline */ #define node_set(node, dst) __node_set((node), &(dst)) static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. */ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static inline int __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); } #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) static inline int __first_node(const nodemask_t *srcp) { return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static inline int __next_node(int n, const nodemask_t *srcp) { return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* * Find the next present node in src, starting after node n, wrapping around to * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) int __next_node_in(int node, const nodemask_t *srcp); static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL << (node); \ } else { \ init_nodemask_of_node(&m, (node)); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static inline int __first_unset_node(const nodemask_t *maskp) { return min_t(int,MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) static inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); } #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); } #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); } #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ if (!nodes_empty(mask)) \ for ((node) = 0; (node) < 1; (node)++) #endif /* MAX_NUMNODES */ /* * Bitmasks that are kept for all the nodes. */ enum node_states { N_POSSIBLE, /* The node could become online at some point */ N_ONLINE, /* The node is online */ N_NORMAL_MEMORY, /* The node has regular memory */ #ifdef CONFIG_HIGHMEM N_HIGH_MEMORY, /* The node has regular or high memory */ #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; /* * The following particular system nodemasks and operations * on them manage all possible and online nodes. */ extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 static inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } static inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } static inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } static inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } #define for_each_node_state(__node, __state) \ for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) static inline int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } static inline int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; static inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } static inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } #else static inline int node_state(int node, enum node_states state) { return node == 0; } static inline void node_set_state(int node, enum node_states state) { } static inline void node_clear_state(int node, enum node_states state) { } static inline int num_node_state(enum node_states state) { return 1; } #define for_each_node_state(node, __state) \ for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) extern int node_random(const nodemask_t *maskp); #else static inline int node_random(const nodemask_t *mask) { return 0; } #endif #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE] #define num_online_nodes() num_node_state(N_ONLINE) #define num_possible_nodes() num_node_state(N_POSSIBLE) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) /* * For nodemask scrach area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ #if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) #else #define NODEMASK_ALLOC(type, name, gfp_flags) type _##name, *name = &_##name #define NODEMASK_FREE(m) do {} while (0) #endif /* A example struture for using NODEMASK_ALLOC, used in mempolicy. */ struct nodemask_scratch { nodemask_t mask1; nodemask_t mask2; }; #define NODEMASK_SCRATCH(x) \ NODEMASK_ALLOC(struct nodemask_scratch, x, \ GFP_KERNEL | __GFP_NORETRY) #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) #endif /* __LINUX_NODEMASK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2009-2019 Christoph Hellwig * * NOTE: none of these tracepoints shall be consider a stable kernel ABI * as they can change at any time. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap #if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _IOMAP_TRACE_H #include <linux/tracepoint.h> struct inode; DECLARE_EVENT_CLASS(iomap_readpage_class, TP_PROTO(struct inode *inode, int nr_pages), TP_ARGS(inode, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(int, nr_pages) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_pages = nr_pages; ), TP_printk("dev %d:%d ino 0x%llx nr_pages %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->nr_pages) ) #define DEFINE_READPAGE_EVENT(name) \ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) __field(unsigned long, offset) __field(unsigned int, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->size = i_size_read(inode); __entry->offset = off; __entry->length = len; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx " "length %x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->length) ) #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_releasepage); DEFINE_RANGE_EVENT(iomap_invalidatepage); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ { IOMAP_DELALLOC, "DELALLOC" }, \ { IOMAP_MAPPED, "MAPPED" }, \ { IOMAP_UNWRITTEN, "UNWRITTEN" }, \ { IOMAP_INLINE, "INLINE" } #define IOMAP_FLAGS_STRINGS \ { IOMAP_WRITE, "WRITE" }, \ { IOMAP_ZERO, "ZERO" }, \ { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ { IOMAP_F_DIRTY, "DIRTY" }, \ { IOMAP_F_SHARED, "SHARED" }, \ { IOMAP_F_MERGED, "MERGED" }, \ { IOMAP_F_BUFFER_HEAD, "BH" }, \ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), TP_ARGS(inode, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(u64, addr) __field(loff_t, offset) __field(u64, length) __field(u16, type) __field(u16, flags) __field(dev_t, bdev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->addr = iomap->addr; __entry->offset = iomap->offset; __entry->length = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld " "length %llu type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), __entry->addr, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) ) #define DEFINE_IOMAP_EVENT(name) \ DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_apply_dstmap); DEFINE_IOMAP_EVENT(iomap_apply_srcmap); TRACE_EVENT(iomap_apply, TP_PROTO(struct inode *inode, loff_t pos, loff_t length, unsigned int flags, const void *ops, void *actor, unsigned long caller), TP_ARGS(inode, pos, length, flags, ops, actor, caller), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) __field(loff_t, length) __field(unsigned int, flags) __field(const void *, ops) __field(void *, actor) __field(unsigned long, caller) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->length = length; __entry->flags = flags; __entry->ops = ops; __entry->actor = actor; __entry->caller = caller; ), TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) " "ops %ps caller %pS actor %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, (void *)__entry->caller, __entry->actor) ); #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM percpu #if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PERCPU_H #include <linux/tracepoint.h> TRACE_EVENT(percpu_alloc_percpu, TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align, void *base_addr, int off, void __percpu *ptr), TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr), TP_STRUCT__entry( __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) ), TP_fast_assign( __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; ), TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p", __entry->reserved, __entry->is_atomic, __entry->size, __entry->align, __entry->base_addr, __entry->off, __entry->ptr) ); TRACE_EVENT(percpu_free_percpu, TP_PROTO(void *base_addr, int off, void __percpu *ptr), TP_ARGS(base_addr, off, ptr), TP_STRUCT__entry( __field( void *, base_addr ) __field( int, off ) __field( void __percpu *, ptr ) ), TP_fast_assign( __entry->base_addr = base_addr; __entry->off = off; __entry->ptr = ptr; ), TP_printk("base_addr=%p off=%d ptr=%p", __entry->base_addr, __entry->off, __entry->ptr) ); TRACE_EVENT(percpu_alloc_percpu_fail, TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align), TP_ARGS(reserved, is_atomic, size, align), TP_STRUCT__entry( __field( bool, reserved ) __field( bool, is_atomic ) __field( size_t, size ) __field( size_t, align ) ), TP_fast_assign( __entry->reserved = reserved; __entry->is_atomic = is_atomic; __entry->size = size; __entry->align = align; ), TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu", __entry->reserved, __entry->is_atomic, __entry->size, __entry->align) ); TRACE_EVENT(percpu_create_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); TRACE_EVENT(percpu_destroy_chunk, TP_PROTO(void *base_addr), TP_ARGS(base_addr), TP_STRUCT__entry( __field( void *, base_addr ) ), TP_fast_assign( __entry->base_addr = base_addr; ), TP_printk("base_addr=%p", __entry->base_addr) ); #endif /* _TRACE_PERCPU_H */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CPUHOTPLUG_H #define __CPUHOTPLUG_H #include <linux/types.h> /* * CPU-up CPU-down * * BP AP BP AP * * OFFLINE OFFLINE * | ^ * v | * BRINGUP_CPU->AP_OFFLINE BRINGUP_CPU <- AP_IDLE_DEAD (idle thread/play_dead) * | AP_OFFLINE * v (IRQ-off) ,---------------^ * AP_ONLNE | (stop_machine) * | TEARDOWN_CPU <- AP_ONLINE_IDLE * | ^ * v | * AP_ACTIVE AP_ACTIVE */ enum cpuhp_state { CPUHP_INVALID = -1, CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, CPUHP_PERF_X86_AMD_UNCORE_PREP, CPUHP_PERF_POWER, CPUHP_PERF_SUPERH, CPUHP_X86_HPET_DEAD, CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, CPUHP_MM_VMSTAT_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, CPUHP_CPUIDLE_DEAD, CPUHP_ARM64_FPSIMD_DEAD, CPUHP_ARM_OMAP_WAKE_DEAD, CPUHP_IRQ_POLL_DEAD, CPUHP_BLOCK_SOFTIRQ_DEAD, CPUHP_ACPI_CPUDRV_DEAD, CPUHP_S390_PFAULT_DEAD, CPUHP_BLK_MQ_DEAD, CPUHP_FS_BUFF_DEAD, CPUHP_PRINTK_DEAD, CPUHP_MM_MEMCQ_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, CPUHP_PAGE_ALLOC_DEAD, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, CPUHP_XEN_PREPARE, CPUHP_XEN_EVTCHN_PREPARE, CPUHP_ARM_SHMOBILE_SCU_PREPARE, CPUHP_SH_SH3X_PREPARE, CPUHP_NET_FLOW_PREPARE, CPUHP_TOPOLOGY_PREPARE, CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, CPUHP_MM_ZS_PREPARE, CPUHP_MM_ZSWP_MEM_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_SCHED_STARTING, CPUHP_AP_RCUTREE_DYING, CPUHP_AP_CPU_PM_STARTING, CPUHP_AP_IRQ_GIC_STARTING, CPUHP_AP_IRQ_HIP04_STARTING, CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_IRQ_RISCV_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, CPUHP_AP_MICROCODE_LOADER, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, CPUHP_AP_PERF_X86_CQM_STARTING, CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, CPUHP_AP_ARM_SDEI_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, CPUHP_AP_PERF_ARM_ACPI_STARTING, CPUHP_AP_PERF_ARM_STARTING, CPUHP_AP_ARM_L2X0_STARTING, CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING, CPUHP_AP_ARM_ARCH_TIMER_STARTING, CPUHP_AP_ARM_GLOBAL_TIMER_STARTING, CPUHP_AP_JCORE_TIMER_STARTING, CPUHP_AP_ARM_TWD_STARTING, CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, CPUHP_AP_MARCO_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, CPUHP_AP_CLINT_TIMER_STARTING, CPUHP_AP_CSKY_TIMER_STARTING, CPUHP_AP_TI_GP_TIMER_STARTING, CPUHP_AP_HYPERV_TIMER_STARTING, CPUHP_AP_KVM_STARTING, CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, CPUHP_AP_KVM_ARM_VGIC_STARTING, CPUHP_AP_KVM_ARM_TIMER_STARTING, /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, CPUHP_AP_ARM_XEN_STARTING, CPUHP_AP_ARM_CORESIGHT_STARTING, CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_BLK_MQ_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, CPUHP_AP_PERF_X86_RAPL_ONLINE, CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE, CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE, CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE, CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); /** * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks * @state: The state for which the calls are installed * @name: Name of the callback (will be used in debug output) * @startup: startup callback function * @teardown: teardown callback function * * Installs the callback functions and invokes the startup callback on * the present cpus which have already reached the @state. */ static inline int cpuhp_setup_state(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state(state, name, true, startup, teardown, false); } static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state_cpuslocked(state, name, true, startup, teardown, false); } /** * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the * callbacks * @state: The state for which the calls are installed * @name: Name of the callback. * @startup: startup callback function * @teardown: teardown callback function * * Same as @cpuhp_setup_state except that no calls are executed are invoked * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n. */ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state(state, name, false, startup, teardown, false); } static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state_cpuslocked(state, name, false, startup, teardown, false); } /** * cpuhp_setup_state_multi - Add callbacks for multi state * @state: The state for which the calls are installed * @name: Name of the callback. * @startup: startup callback function * @teardown: teardown callback function * * Sets the internal multi_instance flag and prepares a state to work as a multi * instance callback. No callbacks are invoked at this point. The callbacks are * invoked once an instance for this state are registered via * @cpuhp_state_add_instance or @cpuhp_state_add_instance_nocalls. */ static inline int cpuhp_setup_state_multi(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu, struct hlist_node *node), int (*teardown)(unsigned int cpu, struct hlist_node *node)) { return __cpuhp_setup_state(state, name, false, (void *) startup, (void *) teardown, true); } int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, struct hlist_node *node, bool invoke); /** * cpuhp_state_add_instance - Add an instance for a state and invoke startup * callback. * @state: The state for which the instance is installed * @node: The node for this individual state. * * Installs the instance for the @state and invokes the startup callback on * the present cpus which have already reached the @state. The @state must have * been earlier marked as multi-instance by @cpuhp_setup_state_multi. */ static inline int cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_add_instance(state, node, true); } /** * cpuhp_state_add_instance_nocalls - Add an instance for a state without * invoking the startup callback. * @state: The state for which the instance is installed * @node: The node for this individual state. * * Installs the instance for the @state The @state must have been earlier * marked as multi-instance by @cpuhp_setup_state_multi. */ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_add_instance(state, node, false); } static inline int cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_add_instance_cpuslocked(state, node, false); } void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); /** * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown * @state: The state for which the calls are removed * * Removes the callback functions and invokes the teardown callback on * the present cpus which have already reached the @state. */ static inline void cpuhp_remove_state(enum cpuhp_state state) { __cpuhp_remove_state(state, true); } /** * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking * teardown * @state: The state for which the calls are removed */ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) { __cpuhp_remove_state(state, false); } static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state) { __cpuhp_remove_state_cpuslocked(state, false); } /** * cpuhp_remove_multi_state - Remove hotplug multi state callback * @state: The state for which the calls are removed * * Removes the callback functions from a multi state. This is the reverse of * cpuhp_setup_state_multi(). All instances should have been removed before * invoking this function. */ static inline void cpuhp_remove_multi_state(enum cpuhp_state state) { __cpuhp_remove_state(state, false); } int __cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); /** * cpuhp_state_remove_instance - Remove hotplug instance from state and invoke * the teardown callback * @state: The state from which the instance is removed * @node: The node for this individual state. * * Removes the instance and invokes the teardown callback on the present cpus * which have already reached the @state. */ static inline int cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_remove_instance(state, node, true); } /** * cpuhp_state_remove_instance_nocalls - Remove hotplug instance from state * without invoking the reatdown callback * @state: The state from which the instance is removed * @node: The node for this individual state. * * Removes the instance without invoking the teardown callback. */ static inline int cpuhp_state_remove_instance_nocalls(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_remove_instance(state, node, false); } #ifdef CONFIG_SMP void cpuhp_online_idle(enum cpuhp_state state); #else static inline void cpuhp_online_idle(enum cpuhp_state state) { } #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 /* SPDX-License-Identifier: GPL-2.0 */ /* * NFS internal definitions */ #include "nfs4_fs.h" #include <linux/fs_context.h> #include <linux/security.h> #include <linux/crc32.h> #include <linux/sunrpc/addr.h> #include <linux/nfs_page.h> #include <linux/wait_bit.h> #define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; struct nfs_string; struct nfs_pageio_descriptor; static inline void nfs_attr_check_mountpoint(struct super_block *parent, struct nfs_fattr *fattr) { if (!nfs_fsid_equal(&NFS_SB(parent)->fsid, &fattr->fsid)) fattr->valid |= NFS_ATTR_FATTR_MOUNTPOINT; } static inline int nfs_attr_use_mounted_on_fileid(struct nfs_fattr *fattr) { if (((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) == 0) || (((fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT) == 0) && ((fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) == 0))) return 0; return 1; } static inline bool nfs_lookup_is_soft_revalidate(const struct dentry *dentry) { if (!(NFS_SB(dentry->d_sb)->flags & NFS_MOUNT_SOFTREVAL)) return false; if (!d_is_positive(dentry) || !NFS_FH(d_inode(dentry))->size) return false; return true; } /* * Note: RFC 1813 doesn't limit the number of auth flavors that * a server can return, so make something up. */ #define NFS_MAX_SECFLAVORS (12) /* * Value used if the user did not specify a port value. */ #define NFS_UNSPEC_PORT (-1) #define NFS_UNSPEC_RETRANS (UINT_MAX) #define NFS_UNSPEC_TIMEO (UINT_MAX) /* * Maximum number of pages that readdir can use for creating * a vmapped array of pages. */ #define NFS_MAX_READDIR_PAGES 8 struct nfs_client_initdata { unsigned long init_flags; const char *hostname; /* Hostname of the server */ const struct sockaddr *addr; /* Address of the server */ const char *nodename; /* Hostname of the client */ const char *ip_addr; /* IP address of the client */ size_t addrlen; struct nfs_subversion *nfs_mod; int proto; u32 minorversion; unsigned int nconnect; struct net *net; const struct rpc_timeout *timeparms; const struct cred *cred; }; /* * In-kernel mount arguments */ struct nfs_fs_context { bool internal; bool skip_reconfig_option_check; bool need_mount; bool sloppy; unsigned int flags; /* NFS{,4}_MOUNT_* flags */ unsigned int rsize, wsize; unsigned int timeo, retrans; unsigned int acregmin, acregmax; unsigned int acdirmin, acdirmax; unsigned int namlen; unsigned int options; unsigned int bsize; struct nfs_auth_info auth_info; rpc_authflavor_t selected_flavor; char *client_address; unsigned int version; unsigned int minorversion; char *fscache_uniq; unsigned short protofamily; unsigned short mountfamily; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; u32 version; int port; unsigned short protocol; } mount_server; struct { union { struct sockaddr address; struct sockaddr_storage _address; }; size_t addrlen; char *hostname; char *export_path; int port; unsigned short protocol; unsigned short nconnect; unsigned short export_path_len; } nfs_server; struct nfs_fh *mntfh; struct nfs_server *server; struct nfs_subversion *nfs_mod; /* Information for a cloned mount. */ struct nfs_clone_mount { struct super_block *sb; struct dentry *dentry; struct nfs_fattr *fattr; unsigned int inherited_bsize; } clone_data; }; #define nfs_errorf(fc, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_ferrorf(fc, fac, fmt, ...) ((fc)->log.log ? \ errorf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) #define nfs_invalf(fc, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_finvalf(fc, fac, fmt, ...) ((fc)->log.log ? \ invalf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); -EINVAL; })) #define nfs_warnf(fc, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dprintk(fmt "\n", ## __VA_ARGS__); })) #define nfs_fwarnf(fc, fac, fmt, ...) ((fc)->log.log ? \ warnf(fc, fmt, ## __VA_ARGS__) : \ ({ dfprintk(fac, fmt "\n", ## __VA_ARGS__); })) static inline struct nfs_fs_context *nfs_fc2context(const struct fs_context *fc) { return fc->fs_private; } /* mount_clnt.c */ struct nfs_mount_request { struct sockaddr *sap; size_t salen; char *hostname; char *dirpath; u32 version; unsigned short protocol; struct nfs_fh *fh; int noresvport; unsigned int *auth_flav_len; rpc_authflavor_t *auth_flavs; struct net *net; }; extern int nfs_mount(struct nfs_mount_request *info); extern void nfs_umount(const struct nfs_mount_request *info); /* client.c */ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern void nfs_clients_exit(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); void nfs_init_timeout_values(struct rpc_timeout *to, int proto, int timeo, int retrans); int nfs_init_server_rpcclient(struct nfs_server *, const struct rpc_timeout *t, rpc_authflavor_t); struct nfs_server *nfs_alloc_server(void); void nfs_server_copy_userdata(struct nfs_server *, struct nfs_server *); extern void nfs_put_client(struct nfs_client *); extern void nfs_free_client(struct nfs_client *); extern struct nfs_client *nfs4_find_client_ident(struct net *, int); extern struct nfs_client * nfs4_find_client_sessionid(struct net *, const struct sockaddr *, struct nfs4_sessionid *, u32); extern struct nfs_server *nfs_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_server(struct fs_context *); extern struct nfs_server *nfs4_create_referral_server(struct fs_context *); extern int nfs4_update_server(struct nfs_server *server, const char *hostname, struct sockaddr *sap, size_t salen, struct net *net); extern void nfs_free_server(struct nfs_server *server); extern struct nfs_server *nfs_clone_server(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, rpc_authflavor_t); extern bool nfs_client_init_is_complete(const struct nfs_client *clp); extern int nfs_client_init_status(const struct nfs_client *clp); extern int nfs_wait_client_init_complete(const struct nfs_client *clp); extern void nfs_mark_client_ready(struct nfs_client *clp, int state); extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); extern int nfs_fs_proc_net_init(struct net *net); extern void nfs_fs_proc_net_exit(struct net *net); #else static inline int nfs_fs_proc_net_init(struct net *net) { return 0; } static inline void nfs_fs_proc_net_exit(struct net *net) { } static inline int nfs_fs_proc_init(void) { return 0; } static inline void nfs_fs_proc_exit(void) { } #endif /* callback_xdr.c */ extern const struct svc_version nfs4_callback_version1; extern const struct svc_version nfs4_callback_version4; /* fs_context.c */ extern struct file_system_type nfs_fs_type; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); extern void nfs_destroy_nfspagecache(void); extern int __init nfs_init_readpagecache(void); extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); extern void nfs_destroy_writepagecache(void); extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, const struct cred *cred, const struct nfs_rpc_ops *rpc_ops, const struct rpc_call_ops *call_ops, int how, int flags); void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, const struct nfs_open_context *ctx2) { return cred_fscmp(ctx1->cred, ctx2->cred) == 0 && ctx1->state == ctx2->state; } /* nfs2xdr.c */ extern const struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs3xdr.c */ extern const struct rpc_procinfo nfs3_procedures[]; extern int nfs3_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); /* nfs4xdr.c */ #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_decode_dirent(struct xdr_stream *, struct nfs_entry *, bool); #endif #ifdef CONFIG_NFS_V4_1 extern const u32 nfs41_maxread_overhead; extern const u32 nfs41_maxwrite_overhead; extern const u32 nfs41_maxgetdevinfo_overhead; #endif /* nfs4proc.c */ #if IS_ENABLED(CONFIG_NFS_V4) extern const struct rpc_procinfo nfs4_procedures[]; #endif #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { if (!dst || !src) return NULL; if (src->len > NFS4_MAXLABELLEN) return NULL; dst->lfs = src->lfs; dst->pi = src->pi; dst->len = src->len; memcpy(dst->label, src->label, src->len); return dst; } static inline void nfs4_label_free(struct nfs4_label *label) { if (label) { kfree(label->label); kfree(label); } return; } static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { if (nfs_server_capable(&nfsi->vfs_inode, NFS_CAP_SECURITY_LABEL)) nfsi->cache_validity |= NFS_INO_INVALID_LABEL; } #else static inline struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { return NULL; } static inline void nfs4_label_free(void *label) {} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } static inline struct nfs4_label * nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) { return NULL; } #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync); extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ extern void nfs_advise_use_readdirplus(struct inode *dir); extern void nfs_force_use_readdirplus(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); extern unsigned long nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc); struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int); int nfs_create(struct inode *, struct dentry *, umode_t, bool); int nfs_mkdir(struct inode *, struct dentry *, umode_t); int nfs_rmdir(struct inode *, struct dentry *); int nfs_unlink(struct inode *, struct dentry *); int nfs_symlink(struct inode *, struct dentry *, const char *); int nfs_link(struct dentry *, struct inode *, struct dentry *); int nfs_mknod(struct inode *, struct dentry *, umode_t, dev_t); int nfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); /* file.c */ int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync); loff_t nfs_file_llseek(struct file *, loff_t, int); ssize_t nfs_file_read(struct kiocb *, struct iov_iter *); int nfs_file_mmap(struct file *, struct vm_area_struct *); ssize_t nfs_file_write(struct kiocb *, struct iov_iter *); int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_free_inode(struct inode *); extern int nfs_write_inode(struct inode *, struct writeback_control *); extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); extern int nfs_wait_atomic_killable(atomic_t *p, unsigned int mode); /* super.c */ extern const struct super_operations nfs_sops; bool nfs_auth_info_match(const struct nfs_auth_info *, rpc_authflavor_t); int nfs_try_get_tree(struct fs_context *); int nfs_get_tree_common(struct fs_context *); void nfs_kill_super(struct super_block *); extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); extern bool nfs_sb_active(struct super_block *sb); extern void nfs_sb_deactive(struct super_block *sb); extern int nfs_client_for_each_server(struct nfs_client *clp, int (*fn)(struct nfs_server *, void *), void *data); /* io.c */ extern void nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); extern void nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); extern void nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) { return test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0; } /* namespace.c */ #define NFS_PATH_CANONICAL 1 extern char *nfs_path(char **p, struct dentry *dentry, char *buffer, ssize_t buflen, unsigned flags); extern struct vfsmount *nfs_d_automount(struct path *path); int nfs_submount(struct fs_context *, struct nfs_server *); int nfs_do_submount(struct fs_context *); /* getroot.c */ extern int nfs_get_root(struct super_block *s, struct fs_context *fc); #if IS_ENABLED(CONFIG_NFS_V4) extern int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool); #endif struct nfs_pgio_completion_ops; /* read.c */ extern void nfs_pageio_init_read(struct nfs_pageio_descriptor *pgio, struct inode *inode, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_read_prepare(struct rpc_task *task, void *calldata); extern void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio); /* super.c */ void nfs_umount_begin(struct super_block *); int nfs_statfs(struct dentry *, struct kstatfs *); int nfs_show_options(struct seq_file *, struct dentry *); int nfs_show_devname(struct seq_file *, struct dentry *); int nfs_show_path(struct seq_file *, struct dentry *); int nfs_show_stats(struct seq_file *, struct dentry *); int nfs_reconfigure(struct fs_context *); /* write.c */ extern void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, struct inode *inode, int ioflags, bool force_mds, const struct nfs_pgio_completion_ops *compl_ops); extern void nfs_pageio_reset_write_mds(struct nfs_pageio_descriptor *pgio); extern void nfs_commit_free(struct nfs_commit_data *p); extern void nfs_write_prepare(struct rpc_task *task, void *calldata); extern void nfs_commit_prepare(struct rpc_task *task, void *calldata); extern int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, const struct nfs_rpc_ops *nfs_ops, const struct rpc_call_ops *call_ops, int how, int flags); extern void nfs_init_commit(struct nfs_commit_data *data, struct list_head *head, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo); int nfs_scan_commit_list(struct list_head *src, struct list_head *dst, struct nfs_commit_info *cinfo, int max); unsigned long nfs_reqs_to_commit(struct nfs_commit_info *); int nfs_scan_commit(struct inode *inode, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); int nfs_write_need_commit(struct nfs_pgio_header *); void nfs_writeback_update_inode(struct nfs_pgio_header *hdr); int nfs_generic_commit_list(struct inode *inode, struct list_head *head, int how, struct nfs_commit_info *cinfo); void nfs_retry_commit(struct list_head *page_list, struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, struct nfs_commit_info *cinfo); void nfs_request_remove_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_init_cinfo(struct nfs_commit_info *cinfo, struct inode *inode, struct nfs_direct_req *dreq); int nfs_key_timeout_notify(struct file *filp, struct inode *inode); bool nfs_ctx_key_to_expire(struct nfs_open_context *ctx, struct inode *inode); void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio); int nfs_filemap_write_and_wait_range(struct address_space *mapping, loff_t lstart, loff_t lend); #ifdef CONFIG_NFS_V4_1 static inline void pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets, unsigned int nbuckets) { unsigned int i; for (i = 0; i < nbuckets; i++) buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW; } static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { struct pnfs_commit_array *array; rcu_read_lock(); list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list) pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets, array->nbuckets); rcu_read_unlock(); } #else static inline void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo) { } #endif #ifdef CONFIG_MIGRATION extern int nfs_migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); #endif static inline int nfs_write_verifier_cmp(const struct nfs_write_verifier *v1, const struct nfs_write_verifier *v2) { return memcmp(v1->data, v2->data, sizeof(v1->data)); } static inline bool nfs_write_match_verf(const struct nfs_writeverf *verf, struct nfs_page *req) { return verf->committed > NFS_UNSTABLE && !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier); } /* unlink.c */ extern struct rpc_task * nfs_async_rename(struct inode *old_dir, struct inode *new_dir, struct dentry *old_dentry, struct dentry *new_dentry, void (*complete)(struct rpc_task *, struct nfs_renamedata *)); extern int nfs_sillyrename(struct inode *dir, struct dentry *dentry); /* direct.c */ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo, struct nfs_direct_req *dreq); extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern struct nfs_client *nfs4_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); extern int nfs40_walk_client_list(struct nfs_client *clp, struct nfs_client **result, const struct cred *cred); extern int nfs41_walk_client_list(struct nfs_client *clp, struct nfs_client **result, const struct cred *cred); extern void nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt, void *data); static inline struct inode *nfs_igrab_and_active(struct inode *inode) { struct super_block *sb = inode->i_sb; if (sb && nfs_sb_active(sb)) { if (igrab(inode)) return inode; nfs_sb_deactive(sb); } return NULL; } static inline void nfs_iput_and_deactive(struct inode *inode) { if (inode != NULL) { struct super_block *sb = inode->i_sb; iput(inode); nfs_sb_deactive(sb); } } /* * Determine the device name as a string */ static inline char *nfs_devname(struct dentry *dentry, char *buffer, ssize_t buflen) { char *dummy; return nfs_path(&dummy, dentry, buffer, buflen, NFS_PATH_CANONICAL); } /* * Determine the actual block size (and log2 thereof) */ static inline unsigned long nfs_block_bits(unsigned long bsize, unsigned char *nrbitsp) { /* make sure blocksize is a power of two */ if ((bsize & (bsize - 1)) || nrbitsp) { unsigned char nrbits; for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) ; bsize = 1 << nrbits; if (nrbitsp) *nrbitsp = nrbits; } return bsize; } /* * Calculate the number of 512byte blocks used. */ static inline blkcnt_t nfs_calc_block_size(u64 tsize) { blkcnt_t used = (tsize + 511) >> 9; return (used > ULONG_MAX) ? ULONG_MAX : used; } /* * Compute and set NFS server blocksize */ static inline unsigned long nfs_block_size(unsigned long bsize, unsigned char *nrbitsp) { if (bsize < NFS_MIN_FILE_IO_SIZE) bsize = NFS_DEF_FILE_IO_SIZE; else if (bsize >= NFS_MAX_FILE_IO_SIZE) bsize = NFS_MAX_FILE_IO_SIZE; return nfs_block_bits(bsize, nrbitsp); } /* * Determine the maximum file size for a superblock */ static inline void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) { sb->s_maxbytes = (loff_t)maxfilesize; if (sb->s_maxbytes > MAX_LFS_FILESIZE || sb->s_maxbytes <= 0) sb->s_maxbytes = MAX_LFS_FILESIZE; } /* * Record the page as unstable (an extra writeback period) and mark its * inode as dirty. */ static inline void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) { if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; /* This page is really still in write-back - just that the * writeback is happening on the server now. */ inc_node_page_state(page, NR_WRITEBACK); inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } /* * Determine the number of bytes of data the page contains */ static inline unsigned int nfs_page_length(struct page *page) { loff_t i_size = i_size_read(page_file_mapping(page)->host); if (i_size > 0) { pgoff_t index = page_index(page); pgoff_t end_index = (i_size - 1) >> PAGE_SHIFT; if (index < end_index) return PAGE_SIZE; if (index == end_index) return ((i_size - 1) & ~PAGE_MASK) + 1; } return 0; } /* * Convert a umode to a dirent->d_type */ static inline unsigned char nfs_umode_to_dtype(umode_t mode) { return (mode >> 12) & 15; } /* * Determine the number of pages in an array of length 'len' and * with a base offset of 'base' */ static inline unsigned int nfs_page_array_len(unsigned int base, size_t len) { return ((unsigned long)len + (unsigned long)base + PAGE_SIZE - 1) >> PAGE_SHIFT; } /* * Convert a struct timespec64 into a 64-bit change attribute * * This does approximately the same thing as timespec64_to_ns(), * but for calculation efficiency, we multiply the seconds by * 1024*1024*1024. */ static inline u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) { return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } #ifdef CONFIG_CRC32 /** * nfs_fhandle_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle * * returns a crc32 hash for the filehandle that is compatible with * the one displayed by "wireshark". */ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); } static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } #else static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return 0; } static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) { return 0; } #endif static inline bool nfs_error_is_fatal(int err) { switch (err) { case -ERESTARTSYS: case -EINTR: case -EACCES: case -EDQUOT: case -EFBIG: case -EIO: case -ENOSPC: case -EROFS: case -ESTALE: case -E2BIG: case -ENOMEM: case -ETIMEDOUT: return true; default: return false; } } static inline bool nfs_error_is_fatal_on_server(int err) { switch (err) { case 0: case -ERESTARTSYS: case -EINTR: return false; } return nfs_error_is_fatal(err); } /* * Select between a default port value and a user-specified port value. * If a zero value is set, then autobind will be used. */ static inline void nfs_set_port(struct sockaddr *sap, int *port, const unsigned short default_port) { if (*port == NFS_UNSPEC_PORT) *port = default_port; rpc_set_port(sap, *port); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_LOCAL_H #define _ASM_X86_LOCAL_H #include <linux/percpu.h> #include <linux/atomic.h> #include <asm/asm.h> typedef struct { atomic_long_t a; } local_t; #define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) } #define local_read(l) atomic_long_read(&(l)->a) #define local_set(l, i) atomic_long_set(&(l)->a, (i)) static inline void local_inc(local_t *l) { asm volatile(_ASM_INC "%0" : "+m" (l->a.counter)); } static inline void local_dec(local_t *l) { asm volatile(_ASM_DEC "%0" : "+m" (l->a.counter)); } static inline void local_add(long i, local_t *l) { asm volatile(_ASM_ADD "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } static inline void local_sub(long i, local_t *l) { asm volatile(_ASM_SUB "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } /** * local_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @l: pointer to type local_t * * Atomically subtracts @i from @l and returns * true if the result is zero, or false for all * other cases. */ static inline bool local_sub_and_test(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i); } /** * local_dec_and_test - decrement and test * @l: pointer to type local_t * * Atomically decrements @l by 1 and * returns true if the result is 0, or false for all other * cases. */ static inline bool local_dec_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e); } /** * local_inc_and_test - increment and test * @l: pointer to type local_t * * Atomically increments @l by 1 * and returns true if the result is zero, or false for all * other cases. */ static inline bool local_inc_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e); } /** * local_add_negative - add and test if negative * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns true * if the result is negative, or false when * result is greater than or equal to zero. */ static inline bool local_add_negative(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i); } /** * local_add_return - add and return * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns @i + @l */ static inline long local_add_return(long i, local_t *l) { long __i = i; asm volatile(_ASM_XADD "%0, %1;" : "+r" (i), "+m" (l->a.counter) : : "memory"); return i + __i; } static inline long local_sub_return(long i, local_t *l) { return local_add_return(-i, l); } #define local_inc_return(l) (local_add_return(1, l)) #define local_dec_return(l) (local_sub_return(1, l)) #define local_cmpxchg(l, o, n) \ (cmpxchg_local(&((l)->a.counter), (o), (n))) /* Always has a lock prefix */ #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** * local_add_unless - add unless the number is a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * * Atomically adds @a to @l, so long as it was not @u. * Returns non-zero if @l was not @u, and zero otherwise. */ #define local_add_unless(l, a, u) \ ({ \ long c, old; \ c = local_read((l)); \ for (;;) { \ if (unlikely(c == (u))) \ break; \ old = local_cmpxchg((l), c, c + (a)); \ if (likely(old == c)) \ break; \ c = old; \ } \ c != (u); \ }) #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. * On x86-64 these are better than the atomic variants on SMP kernels * because they dont use a lock prefix. */ #define __local_inc(l) local_inc(l) #define __local_dec(l) local_dec(l) #define __local_add(i, l) local_add((i), (l)) #define __local_sub(i, l) local_sub((i), (l)) #endif /* _ASM_X86_LOCAL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 /* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in * a new subdirectory with this name. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for each * non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned value * will replace static permissions defined in struct attribute. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if a binary attribute is not visible. The returned * value will replace static permissions defined in * struct bin_attribute. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, struct bin_attribute *, int); struct attribute **attrs; struct bin_attribute **bin_attrs; }; /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define SYSFS_PREALLOC 010000 #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct bin_attribute { struct attribute attr; size_t size; void *private; ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_WO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .write = _name##_write, \ .size = _size, \ } #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CHECKSUM_64_H #define _ASM_X86_CHECKSUM_64_H /* * Checksums for x86-64 * Copyright 2002 by Andi Kleen, SuSE Labs * with some code from asm-x86/checksum.h */ #include <linux/compiler.h> #include <linux/uaccess.h> #include <asm/byteorder.h> /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum * * Fold a 32bit running checksum to 16bit and invert it. This is usually * the last step before putting a checksum into a packet. * Make sure not to mix with 64bit checksums. */ static inline __sum16 csum_fold(__wsum sum) { asm(" addl %1,%0\n" " adcl $0xffff,%0" : "=r" (sum) : "r" ((__force u32)sum << 16), "0" ((__force u32)sum & 0xffff0000)); return (__force __sum16)(~(__force u32)sum >> 16); } /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. * * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by * Arnt Gulbrandsen. */ /** * ip_fast_csum - Compute the IPv4 header checksum efficiently. * iph: ipv4 header * ihl: length of header / 4 */ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { unsigned int sum; asm(" movl (%1), %0\n" " subl $4, %2\n" " jbe 2f\n" " addl 4(%1), %0\n" " adcl 8(%1), %0\n" " adcl 12(%1), %0\n" "1: adcl 16(%1), %0\n" " lea 4(%1), %1\n" " decl %2\n" " jne 1b\n" " adcl $0, %0\n" " movl %0, %2\n" " shrl $16, %0\n" " addw %w2, %w0\n" " adcl $0, %0\n" " notl %0\n" "2:" /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) : "1" (iph), "2" (ihl) : "memory"); return (__force __sum16)sum; } /** * csum_tcpup_nofold - Compute an IPv4 pseudo header checksum. * @saddr: source address * @daddr: destination address * @len: length of packet * @proto: ip protocol of packet * @sum: initial sum to be added in (32bit unfolded) * * Returns the pseudo header checksum the input data. Result is * 32bit unfolded. */ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { asm(" addl %1, %0\n" " adcl %2, %0\n" " adcl %3, %0\n" " adcl $0, %0\n" : "=r" (sum) : "g" (daddr), "g" (saddr), "g" ((len + proto)<<8), "0" (sum)); return sum; } /** * csum_tcpup_magic - Compute an IPv4 pseudo header checksum. * @saddr: source address * @daddr: destination address * @len: length of packet * @proto: ip protocol of packet * @sum: initial sum to be added in (32bit unfolded) * * Returns the 16bit pseudo header checksum the input data already * complemented and ready to be filled in. */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } /** * csum_partial - Compute an internet checksum. * @buff: buffer to be checksummed * @len: length of buffer. * @sum: initial sum to be added in (32bit unfolded) * * Returns the 32bit unfolded internet checksum of the buffer. * Before filling it in it needs to be csum_fold()'ed. * buff should be aligned to a 64bit boundary if possible. */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. Use the wrappers below */ extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len); extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len); extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len); /** * ip_compute_csum - Compute an 16bit IP checksum. * @buff: buffer address. * @len: length of buffer. * * Returns the 16bit folded/inverted checksum of the passed buffer. * Ready to fill in. */ extern __sum16 ip_compute_csum(const void *buff, int len); /** * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header. * @saddr: source address * @daddr: destination address * @len: length of packet * @proto: protocol of packet * @sum: initial sum (32bit unfolded) to be added in * * Computes an IPv6 pseudo header checksum. This sum is added the checksum * into UDP/TCP packets and contains some link layer information. * Returns the unfolded 32bit checksum. */ struct in6_addr; #define _HAVE_ARCH_IPV6_CSUM 1 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum sum); static inline unsigned add32_with_carry(unsigned a, unsigned b) { asm("addl %2,%0\n\t" "adcl $0,%0" : "=r" (a) : "0" (a), "rm" (b)); return a; } #define HAVE_ARCH_CSUM_ADD static inline __wsum csum_add(__wsum csum, __wsum addend) { return (__force __wsum)add32_with_carry((__force unsigned)csum, (__force unsigned)addend); } #endif /* _ASM_X86_CHECKSUM_64_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Generic internet FLOW. * */ #ifndef _NET_FLOW_H #define _NET_FLOW_H #include <linux/socket.h> #include <linux/in6.h> #include <linux/atomic.h> #include <net/flow_dissector.h> #include <linux/uidgid.h> /* * ifindex generation is per-net namespace, and loopback is * always the 1st device in ns (see net_dev_init), thus any * loopback device should get ifindex 1 */ #define LOOPBACK_IFINDEX 1 struct flowi_tunnel { __be64 tun_id; }; struct flowi_common { int flowic_oif; int flowic_iif; __u32 flowic_mark; __u8 flowic_tos; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; kuid_t flowic_uid; struct flowi_tunnel flowic_tun_key; __u32 flowic_multipath_hash; }; union flowi_uli { struct { __be16 dport; __be16 sport; } ports; struct { __u8 type; __u8 code; } icmpt; struct { __le16 dport; __le16 sport; } dnports; __be32 spi; __be32 gre_key; struct { __u8 type; } mht; }; struct flowi4 { struct flowi_common __fl_common; #define flowi4_oif __fl_common.flowic_oif #define flowi4_iif __fl_common.flowic_iif #define flowi4_mark __fl_common.flowic_mark #define flowi4_tos __fl_common.flowic_tos #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid #define flowi4_multipath_hash __fl_common.flowic_multipath_hash /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; __be32 daddr; union flowi_uli uli; #define fl4_sport uli.ports.sport #define fl4_dport uli.ports.dport #define fl4_icmp_type uli.icmpt.type #define fl4_icmp_code uli.icmpt.code #define fl4_ipsec_spi uli.spi #define fl4_mh_type uli.mht.type #define fl4_gre_key uli.gre_key } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, kuid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_mark = mark; fl4->flowi4_tos = tos; fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = sport; fl4->flowi4_multipath_hash = 0; } /* Reset some input parameters after previous lookup */ static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos, __be32 daddr, __be32 saddr) { fl4->flowi4_oif = oif; fl4->flowi4_tos = tos; fl4->daddr = daddr; fl4->saddr = saddr; } struct flowi6 { struct flowi_common __fl_common; #define flowi6_oif __fl_common.flowic_oif #define flowi6_iif __fl_common.flowic_iif #define flowi6_mark __fl_common.flowic_mark #define flowi6_scope __fl_common.flowic_scope #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport #define fl6_dport uli.ports.dport #define fl6_icmp_type uli.icmpt.type #define fl6_icmp_code uli.icmpt.code #define fl6_ipsec_spi uli.spi #define fl6_mh_type uli.mht.type #define fl6_gre_key uli.gre_key __u32 mp_hash; } __attribute__((__aligned__(BITS_PER_LONG/8))); struct flowidn { struct flowi_common __fl_common; #define flowidn_oif __fl_common.flowic_oif #define flowidn_iif __fl_common.flowic_iif #define flowidn_mark __fl_common.flowic_mark #define flowidn_scope __fl_common.flowic_scope #define flowidn_proto __fl_common.flowic_proto #define flowidn_flags __fl_common.flowic_flags __le16 daddr; __le16 saddr; union flowi_uli uli; #define fld_sport uli.ports.sport #define fld_dport uli.ports.dport } __attribute__((__aligned__(BITS_PER_LONG/8))); struct flowi { union { struct flowi_common __fl_common; struct flowi4 ip4; struct flowi6 ip6; struct flowidn dn; } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif #define flowi_mark u.__fl_common.flowic_mark #define flowi_tos u.__fl_common.flowic_tos #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) { return container_of(fl4, struct flowi, u.ip4); } static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) { return container_of(fl6, struct flowi, u.ip6); } static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) { return container_of(fldn, struct flowi, u.dn); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_COREDUMP_H #define _LINUX_SCHED_COREDUMP_H #include <linux/mm_types.h> #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ /* mm flags */ /* for SUID_DUMP_* above */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things * that are using this for checking for privilege transitions, it must * test against SUID_DUMP_USER rather than treating it as a boolean * value. */ static inline int __get_dumpable(unsigned long mm_flags) { return mm_flags & MMF_DUMPABLE_MASK; } static inline int get_dumpable(struct mm_struct *mm) { return __get_dumpable(mm->flags); } /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 #define MMF_DUMP_MAPPED_PRIVATE 4 #define MMF_DUMP_MAPPED_SHARED 5 #define MMF_DUMP_ELF_HEADERS 6 #define MMF_DUMP_HUGETLB_PRIVATE 7 #define MMF_DUMP_HUGETLB_SHARED 8 #define MMF_DUMP_DAX_PRIVATE 9 #define MMF_DUMP_DAX_SHARED 10 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ /* * This one-shot flag is dropped due to necessity of changing exe once again * on NFS restore */ //#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 27 /* mm is shared between processes */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TASK_WORK_H #define _LINUX_TASK_WORK_H #include <linux/list.h> #include <linux/sched.h> typedef void (*task_work_func_t)(struct callback_head *); static inline void init_task_work(struct callback_head *twork, task_work_func_t func) { twork->func = func; } enum task_work_notify_mode { TWA_NONE, TWA_RESUME, TWA_SIGNAL, }; int task_work_add(struct task_struct *task, struct callback_head *twork, enum task_work_notify_mode mode); struct callback_head *task_work_cancel(struct task_struct *, task_work_func_t); void task_work_run(void); static inline void exit_task_work(struct task_struct *task) { task_work_run(); } #endif /* _LINUX_TASK_WORK_H */
1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 // SPDX-License-Identifier: GPL-2.0-only /* * Simple NUMA memory policy for the Linux kernel. * * Copyright 2003,2004 Andi Kleen, SuSE Labs. * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc. * * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * * Support four policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * * interleave Allocate memory interleaved over a set of nodes, * with normal fallback if it fails. * For VMA based allocations this interleaves based on the * offset into the backing object or offset into the mapping * for anonymous memory. For process policy an process counter * is used. * * bind Only allocate memory on a specific set of nodes, * no fallback. * FIXME: memory is allocated starting with the first node * to the last. It would be better if bind would truly restrict * the allocation to memory nodes instead * * preferred Try a specific node first before normal fallback. * As a special case NUMA_NO_NODE here means do the allocation * on the local CPU. This is normally identical to default, * but useful to set in a VMA when you have a non default * process policy. * * default Allocate on the local node first, or when on a VMA * use the process policy. This is what Linux always did * in a NUMA aware kernel and still does by, ahem, default. * * The process policy is applied for most non interrupt memory allocations * in that process' context. Interrupts ignore the policies and always * try to allocate on the local CPU. The VMA policy is only applied for memory * allocations for a VMA in the VM. * * Currently there are a few corner cases in swapping where the policy * is not applied, but the majority should be handled. When process policy * is used it is not remembered over swap outs/swap ins. * * Only the highest zone in the zone hierarchy gets policied. Allocations * requesting a lower zone just use default policy. This implies that * on systems with highmem kernel lowmem allocation don't get policied. * Same with GFP_DMA allocations. * * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between * all users and remembered even when nobody has memory mapped. */ /* Notebook: fix mmap readahead to honour policy and enable policy for any page cache object statistics for bigpages global policy for page cache? currently it uses process policy. Requires first item above. handle mremap for shared memory (currently ignored for the policy) grows down? make bind policy root only? It can trigger oom much faster and the kernel is not always grateful with that. */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/mempolicy.h> #include <linux/pagewalk.h> #include <linux/highmem.h> #include <linux/hugetlb.h> #include <linux/kernel.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/nodemask.h> #include <linux/cpuset.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/export.h> #include <linux/nsproxy.h> #include <linux/interrupt.h> #include <linux/init.h> #include <linux/compat.h> #include <linux/ptrace.h> #include <linux/swap.h> #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> #include <linux/mm_inline.h> #include <linux/mmu_notifier.h> #include <linux/printk.h> #include <linux/swapops.h> #include <asm/tlbflush.h> #include <linux/uaccess.h> #include "internal.h" /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ static struct kmem_cache *policy_cache; static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ enum zone_type policy_zone = 0; /* * run-time system-wide default policy => local allocation */ static struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .mode = MPOL_PREFERRED, .flags = MPOL_F_LOCAL, }; static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** * numa_map_to_online_node - Find closest online node * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. */ int numa_map_to_online_node(int node) { int min_dist = INT_MAX, dist, n, min_node; if (node == NUMA_NO_NODE || node_online(node)) return node; min_node = node; for_each_online_node(n) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; min_node = n; } } return min_node; } EXPORT_SYMBOL_GPL(numa_map_to_online_node); struct mempolicy *get_task_policy(struct task_struct *p) { struct mempolicy *pol = p->mempolicy; int node; if (pol) return pol; node = numa_node_id(); if (node != NUMA_NO_NODE) { pol = &preferred_node_policy[node]; /* preferred_node_policy is not initialised early in boot */ if (pol->mode) return pol; } return &default_policy; } static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); } mpol_ops[MPOL_MAX]; static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, const nodemask_t *rel) { nodemask_t tmp; nodes_fold(tmp, *orig, nodes_weight(*rel)); nodes_onto(*ret, tmp, *rel); } static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->v.nodes = *nodes; return 0; } static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes) { if (!nodes) pol->flags |= MPOL_F_LOCAL; /* local allocation */ else if (nodes_empty(*nodes)) return -EINVAL; /* no allowed nodes */ else pol->v.preferred_node = first_node(*nodes); return 0; } static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) { if (nodes_empty(*nodes)) return -EINVAL; pol->v.nodes = *nodes; return 0; } /* * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if * any, for the new policy. mpol_new() has already validated the nodes * parameter with respect to the policy mode and flags. But, we need to * handle an empty nodemask with MPOL_PREFERRED here. * * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_lock for write. */ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes, struct nodemask_scratch *nsc) { int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; /* Check N_MEMORY */ nodes_and(nsc->mask1, cpuset_current_mems_allowed, node_states[N_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1); else nodes_and(nsc->mask2, *nodes, nsc->mask1); if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else pol->w.cpuset_mems_allowed = cpuset_current_mems_allowed; } if (nodes) ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); else ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } /* * This function just creates a new policy, does some check and simple * initialization. You must invoke mpol_set_nodemask() to set nodes. */ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *policy; pr_debug("setting mode %d flags %d nodes[0] %lx\n", mode, flags, nodes ? nodes_addr(*nodes)[0] : NUMA_NO_NODE); if (mode == MPOL_DEFAULT) { if (nodes && !nodes_empty(*nodes)) return ERR_PTR(-EINVAL); return NULL; } VM_BUG_ON(!nodes); /* * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation). * All other modes require a valid pointer to a non-empty nodemask. */ if (mode == MPOL_PREFERRED) { if (nodes_empty(*nodes)) { if (((flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES))) return ERR_PTR(-EINVAL); } } else if (mode == MPOL_LOCAL) { if (!nodes_empty(*nodes) || (flags & MPOL_F_STATIC_NODES) || (flags & MPOL_F_RELATIVE_NODES)) return ERR_PTR(-EINVAL); mode = MPOL_PREFERRED; } else if (nodes_empty(*nodes)) return ERR_PTR(-EINVAL); policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!policy) return ERR_PTR(-ENOMEM); atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; return policy; } /* Slow path of a mpol destructor. */ void __mpol_put(struct mempolicy *p) { if (!atomic_dec_and_test(&p->refcnt)) return; kmem_cache_free(policy_cache, p); } static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) { } static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) nodes_and(tmp, pol->w.user_nodemask, *nodes); else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { nodes_remap(tmp, pol->v.nodes,pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } if (nodes_empty(tmp)) tmp = *nodes; pol->v.nodes = tmp; } static void mpol_rebind_preferred(struct mempolicy *pol, const nodemask_t *nodes) { nodemask_t tmp; if (pol->flags & MPOL_F_STATIC_NODES) { int node = first_node(pol->w.user_nodemask); if (node_isset(node, *nodes)) { pol->v.preferred_node = node; pol->flags &= ~MPOL_F_LOCAL; } else pol->flags |= MPOL_F_LOCAL; } else if (pol->flags & MPOL_F_RELATIVE_NODES) { mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); pol->v.preferred_node = first_node(tmp); } else if (!(pol->flags & MPOL_F_LOCAL)) { pol->v.preferred_node = node_remap(pol->v.preferred_node, pol->w.cpuset_mems_allowed, *nodes); pol->w.cpuset_mems_allowed = *nodes; } } /* * mpol_rebind_policy - Migrate a policy to a different set of nodes * * Per-vma policies are protected by mmap_lock. Allocations using per-task * policies are protected by task->mems_allowed_seq to prevent a premature * OOM/allocation failure due to parallel nodemask modification. */ static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { if (!pol) return; if (!mpol_store_user_nodemask(pol) && !(pol->flags & MPOL_F_LOCAL) && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; mpol_ops[pol->mode].rebind(pol, newmask); } /* * Wrapper for mpol_rebind_policy() that just requires task * pointer, and updates task mempolicy. * * Called with task's alloc_lock held. */ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { mpol_rebind_policy(tsk->mempolicy, new); } /* * Rebind each vma in mm to new nodemask. * * Call holding a reference to mm. Takes mm->mmap_lock during call. */ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; mmap_write_lock(mm); for (vma = mm->mmap; vma; vma = vma->vm_next) mpol_rebind_policy(vma->vm_policy, new); mmap_write_unlock(mm); } static const struct mempolicy_operations mpol_ops[MPOL_MAX] = { [MPOL_DEFAULT] = { .rebind = mpol_rebind_default, }, [MPOL_INTERLEAVE] = { .create = mpol_new_interleave, .rebind = mpol_rebind_nodemask, }, [MPOL_PREFERRED] = { .create = mpol_new_preferred, .rebind = mpol_rebind_preferred, }, [MPOL_BIND] = { .create = mpol_new_bind, .rebind = mpol_rebind_nodemask, }, }; static int migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); struct queue_pages { struct list_head *pagelist; unsigned long flags; nodemask_t *nmask; unsigned long start; unsigned long end; struct vm_area_struct *first; }; /* * Check if the page's nid is in qp->nmask. * * If MPOL_MF_INVERT is set in qp->flags, check if the nid is * in the invert of qp->nmask. */ static inline bool queue_pages_required(struct page *page, struct queue_pages *qp) { int nid = page_to_nid(page); unsigned long flags = qp->flags; return node_isset(nid, *qp->nmask) == !(flags & MPOL_MF_INVERT); } /* * queue_pages_pmd() has four possible return values: * 0 - pages are placed on the right node or queued successfully. * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * 2 - THP was split. * -EIO - is migration entry or only MPOL_MF_STRICT was specified and an * existing page was already on a node that does not follow the * policy. */ static int queue_pages_pmd(pmd_t *pmd, spinlock_t *ptl, unsigned long addr, unsigned long end, struct mm_walk *walk) __releases(ptl) { int ret = 0; struct page *page; struct queue_pages *qp = walk->private; unsigned long flags; if (unlikely(is_pmd_migration_entry(*pmd))) { ret = -EIO; goto unlock; } page = pmd_page(*pmd); if (is_huge_zero_page(page)) { spin_unlock(ptl); __split_huge_pmd(walk->vma, pmd, addr, false, NULL); ret = 2; goto out; } if (!queue_pages_required(page, qp)) goto unlock; flags = qp->flags; /* go to thp migration */ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { if (!vma_migratable(walk->vma) || migrate_page_add(page, qp->pagelist, flags)) { ret = 1; goto unlock; } } else ret = -EIO; unlock: spin_unlock(ptl); out: return ret; } /* * Scan through pages checking if pages follow certain conditions, * and move them to the pagelist if they do. * * queue_pages_pte_range() has three possible return values: * 0 - pages are placed on the right node or queued successfully. * 1 - there is unmovable page, and MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * -EIO - only MPOL_MF_STRICT was specified and an existing page was already * on a node that does not follow the policy. */ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct page *page; struct queue_pages *qp = walk->private; unsigned long flags = qp->flags; int ret; bool has_unmovable = false; pte_t *pte, *mapped_pte; spinlock_t *ptl; ptl = pmd_trans_huge_lock(pmd, vma); if (ptl) { ret = queue_pages_pmd(pmd, ptl, addr, end, walk); if (ret != 2) return ret; } /* THP was split, fall through to pte walk */ if (pmd_trans_unstable(pmd)) return 0; mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); if (!page) continue; /* * vm_normal_page() filters out zero pages, but there might * still be PageReserved pages to skip, perhaps in a VDSO. */ if (PageReserved(page)) continue; if (!queue_pages_required(page, qp)) continue; if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { /* MPOL_MF_STRICT must be specified if we get here */ if (!vma_migratable(vma)) { has_unmovable = true; break; } /* * Do not abort immediately since there may be * temporary off LRU pages in the range. Still * need migrate other LRU pages. */ if (migrate_page_add(page, qp->pagelist, flags)) has_unmovable = true; } else break; } pte_unmap_unlock(mapped_pte, ptl); cond_resched(); if (has_unmovable) return 1; return addr != end ? -EIO : 0; } static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr, unsigned long end, struct mm_walk *walk) { int ret = 0; #ifdef CONFIG_HUGETLB_PAGE struct queue_pages *qp = walk->private; unsigned long flags = (qp->flags & MPOL_MF_VALID); struct page *page; spinlock_t *ptl; pte_t entry; ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte); entry = huge_ptep_get(pte); if (!pte_present(entry)) goto unlock; page = pte_page(entry); if (!queue_pages_required(page, qp)) goto unlock; if (flags == MPOL_MF_STRICT) { /* * STRICT alone means only detecting misplaced page and no * need to further check other vma. */ ret = -EIO; goto unlock; } if (!vma_migratable(walk->vma)) { /* * Must be STRICT with MOVE*, otherwise .test_walk() have * stopped walking current vma. * Detecting misplaced page but allow migrating pages which * have been queued. */ ret = 1; goto unlock; } /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ if (flags & (MPOL_MF_MOVE_ALL) || (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { if (!isolate_huge_page(page, qp->pagelist) && (flags & MPOL_MF_STRICT)) /* * Failed to isolate page but allow migrating pages * which have been queued. */ ret = 1; } unlock: spin_unlock(ptl); #else BUG(); #endif return ret; } #ifdef CONFIG_NUMA_BALANCING /* * This is used to mark a range of virtual addresses to be inaccessible. * These are later cleared by a NUMA hinting fault. Depending on these * faults, pages may be migrated for better NUMA placement. * * This is assuming that NUMA faults are handled using PROT_NONE. If * an architecture makes a different choice, it will need further * changes to the core. */ unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { int nr_updated; nr_updated = change_protection(vma, addr, end, PAGE_NONE, MM_CP_PROT_NUMA); if (nr_updated) count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated); return nr_updated; } #else static unsigned long change_prot_numa(struct vm_area_struct *vma, unsigned long addr, unsigned long end) { return 0; } #endif /* CONFIG_NUMA_BALANCING */ static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { struct vm_area_struct *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; /* range check first */ VM_BUG_ON_VMA((vma->vm_start > start) || (vma->vm_end < end), vma); if (!qp->first) { qp->first = vma; if (!(flags & MPOL_MF_DISCONTIG_OK) && (qp->start < vma->vm_start)) /* hole at head side of range */ return -EFAULT; } if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; /* * Need check MPOL_MF_STRICT to return -EIO if possible * regardless of vma_migratable */ if (!vma_migratable(vma) && !(flags & MPOL_MF_STRICT)) return 1; if (endvma > end) endvma = end; if (flags & MPOL_MF_LAZY) { /* Similar to task_numa_work, skip inaccessible VMAs */ if (!is_vm_hugetlb_page(vma) && vma_is_accessible(vma) && !(vma->vm_flags & VM_MIXEDMAP)) change_prot_numa(vma, start, endvma); return 1; } /* queue pages from current vma */ if (flags & MPOL_MF_VALID) return 0; return 1; } static const struct mm_walk_ops queue_pages_walk_ops = { .hugetlb_entry = queue_pages_hugetlb, .pmd_entry = queue_pages_pte_range, .test_walk = queue_pages_test_walk, }; /* * Walk through page tables and collect pages to be migrated. * * If pages found in a given range are on a set of nodes (determined by * @nodes and @flags,) it's isolated and queued to the pagelist which is * passed via @private. * * queue_pages_range() has three possible return values: * 1 - there is unmovable page, but MPOL_MF_MOVE* & MPOL_MF_STRICT were * specified. * 0 - queue pages successfully or no misplaced page. * errno - i.e. misplaced pages with MPOL_MF_STRICT specified (-EIO) or * memory range specified by nodemask and maxnode points outside * your accessible address space (-EFAULT) */ static int queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end, nodemask_t *nodes, unsigned long flags, struct list_head *pagelist) { int err; struct queue_pages qp = { .pagelist = pagelist, .flags = flags, .nmask = nodes, .start = start, .end = end, .first = NULL, }; err = walk_page_range(mm, start, end, &queue_pages_walk_ops, &qp); if (!qp.first) /* whole range in hole */ err = -EFAULT; return err; } /* * Apply policy to a single VMA * This must be called with the mmap_lock held for writing. */ static int vma_replace_policy(struct vm_area_struct *vma, struct mempolicy *pol) { int err; struct mempolicy *old; struct mempolicy *new; pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); new = mpol_dup(pol); if (IS_ERR(new)) return PTR_ERR(new); if (vma->vm_ops && vma->vm_ops->set_policy) { err = vma->vm_ops->set_policy(vma, new); if (err) goto err_out; } old = vma->vm_policy; vma->vm_policy = new; /* protected by mmap_lock */ mpol_put(old); return 0; err_out: mpol_put(new); return err; } /* Step 2: apply policy to a range and do splits. */ static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { struct vm_area_struct *next; struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; unsigned long vmstart; unsigned long vmend; vma = find_vma(mm, start); VM_BUG_ON(!vma); prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); if (mpol_equal(vma_policy(vma), new_pol)) continue; pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, new_pol, vma->vm_userfaultfd_ctx); if (prev) { vma = prev; next = vma->vm_next; if (mpol_equal(vma_policy(vma), new_pol)) continue; /* vma_merge() joined vma && vma->next, case 8 */ goto replace; } if (vma->vm_start != vmstart) { err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) goto out; } if (vma->vm_end != vmend) { err = split_vma(vma->vm_mm, vma, vmend, 0); if (err) goto out; } replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; } out: return err; } /* Set the process memory policy */ static long do_set_mempolicy(unsigned short mode, unsigned short flags, nodemask_t *nodes) { struct mempolicy *new, *old; NODEMASK_SCRATCH(scratch); int ret; if (!scratch) return -ENOMEM; new = mpol_new(mode, flags, nodes); if (IS_ERR(new)) { ret = PTR_ERR(new); goto out; } ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { mpol_put(new); goto out; } task_lock(current); old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) current->il_prev = MAX_NUMNODES-1; task_unlock(current); mpol_put(old); ret = 0; out: NODEMASK_SCRATCH_FREE(scratch); return ret; } /* * Return nodemask for policy for get_mempolicy() query * * Called with task's alloc_lock held */ static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes) { nodes_clear(*nodes); if (p == &default_policy) return; switch (p->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: *nodes = p->v.nodes; break; case MPOL_PREFERRED: if (!(p->flags & MPOL_F_LOCAL)) node_set(p->v.preferred_node, *nodes); /* else return empty node mask for local allocation */ break; default: BUG(); } } static int lookup_node(struct mm_struct *mm, unsigned long addr) { struct page *p = NULL; int err; int locked = 1; err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked); if (err > 0) { err = page_to_nid(p); put_page(p); } if (locked) mmap_read_unlock(mm); return err; } /* Retrieve NUMA policy */ static long do_get_mempolicy(int *policy, nodemask_t *nmask, unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy, *pol_refcount = NULL; if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; if (flags & MPOL_F_MEMS_ALLOWED) { if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; *policy = 0; /* just so it's initialized */ task_lock(current); *nmask = cpuset_current_mems_allowed; task_unlock(current); return 0; } if (flags & MPOL_F_ADDR) { /* * Do NOT fall back to task policy if the * vma/shared policy at addr is NULL. We * want to return MPOL_DEFAULT in this case. */ mmap_read_lock(mm); vma = find_vma_intersection(mm, addr, addr+1); if (!vma) { mmap_read_unlock(mm); return -EFAULT; } if (vma->vm_ops && vma->vm_ops->get_policy) pol = vma->vm_ops->get_policy(vma, addr); else pol = vma->vm_policy; } else if (addr) return -EINVAL; if (!pol) pol = &default_policy; /* indicates default behavior */ if (flags & MPOL_F_NODE) { if (flags & MPOL_F_ADDR) { /* * Take a refcount on the mpol, lookup_node() * wil drop the mmap_lock, so after calling * lookup_node() only "pol" remains valid, "vma" * is stale. */ pol_refcount = pol; vma = NULL; mpol_get(pol); err = lookup_node(mm, addr); if (err < 0) goto out; *policy = err; } else if (pol == current->mempolicy && pol->mode == MPOL_INTERLEAVE) { *policy = next_node_in(current->il_prev, pol->v.nodes); } else { err = -EINVAL; goto out; } } else { *policy = pol == &default_policy ? MPOL_DEFAULT : pol->mode; /* * Internal mempolicy flags must be masked off before exposing * the policy to userspace. */ *policy |= (pol->flags & MPOL_MODE_FLAGS); } err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { *nmask = pol->w.user_nodemask; } else { task_lock(current); get_policy_nodemask(pol, nmask); task_unlock(current); } } out: mpol_cond_put(pol); if (vma) mmap_read_unlock(mm); if (pol_refcount) mpol_put(pol_refcount); return err; } #ifdef CONFIG_MIGRATION /* * page migration, thp tail pages can be passed. */ static int migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { struct page *head = compound_head(page); /* * Avoid migrating a page that is shared with others. */ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(head) == 1) { if (!isolate_lru_page(head)) { list_add_tail(&head->lru, pagelist); mod_node_page_state(page_pgdat(head), NR_ISOLATED_ANON + page_is_file_lru(head), thp_nr_pages(head)); } else if (flags & MPOL_MF_STRICT) { /* * Non-movable page may reach here. And, there may be * temporary off LRU pages or non-LRU movable pages. * Treat them as unmovable pages since they can't be * isolated, so they can't be moved at the moment. It * should return -EIO for this case too. */ return -EIO; } } return 0; } /* * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; struct migration_target_control mtc = { .nid = dest, .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE, }; nodes_clear(nmask); node_set(source, nmask); /* * This does not "check" the range but isolates all pages that * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, alloc_migration_target, NULL, (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); } return err; } /* * Move pages between the two nodesets so as to preserve the physical * layout as much as possible. * * Returns the number of page that could not be moved. */ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { int busy = 0; int err; nodemask_t tmp; err = migrate_prep(); if (err) return err; mmap_read_lock(mm); /* * Find a 'source' bit set in 'tmp' whose corresponding 'dest' * bit in 'to' is not also set in 'tmp'. Clear the found 'source' * bit in 'tmp', and return that <source, dest> pair for migration. * The pair of nodemasks 'to' and 'from' define the map. * * If no pair of bits is found that way, fallback to picking some * pair of 'source' and 'dest' bits that are not the same. If the * 'source' and 'dest' bits are the same, this represents a node * that will be migrating to itself, so no pages need move. * * If no bits are left in 'tmp', or if all remaining bits left * in 'tmp' correspond to the same bit in 'to', return false * (nothing left to migrate). * * This lets us pick a pair of nodes to migrate between, such that * if possible the dest node is not already occupied by some other * source node, minimizing the risk of overloading the memory on a * node that would happen if we migrated incoming memory to a node * before migrating outgoing memory source that same node. * * A single scan of tmp is sufficient. As we go, we remember the * most recent <s, d> pair that moved (s != d). If we find a pair * that not only moved, but what's better, moved to an empty slot * (d is not set in tmp), then we break out then, with that pair. * Otherwise when we finish scanning from_tmp, we at least have the * most recent <s, d> pair that moved. If we get all the way through * the scan of tmp without finding any node that moved, much less * moved to an empty node, then there is nothing left worth migrating. */ tmp = *from; while (!nodes_empty(tmp)) { int s,d; int source = NUMA_NO_NODE; int dest = 0; for_each_node_mask(s, tmp) { /* * do_migrate_pages() tries to maintain the relative * node relationship of the pages established between * threads and memory areas. * * However if the number of source nodes is not equal to * the number of destination nodes we can not preserve * this node relative relationship. In that case, skip * copying memory from a node that is in the destination * mask. * * Example: [2,3,4] -> [3,4,5] moves everything. * [0-7] - > [3,4,5] moves only 0,1,2,6,7. */ if ((nodes_weight(*from) != nodes_weight(*to)) && (node_isset(s, *to))) continue; d = node_remap(s, *from, *to); if (s == d) continue; source = s; /* Node moved. Memorize */ dest = d; /* dest not in remaining from nodes? */ if (!node_isset(dest, tmp)) break; } if (source == NUMA_NO_NODE) break; node_clear(source, tmp); err = migrate_to_node(mm, source, dest, flags); if (err > 0) busy += err; if (err < 0) break; } mmap_read_unlock(mm); if (err < 0) return err; return busy; } /* * Allocate a new page for page migration based on vma policy. * Start by assuming the page is mapped by the same vma as contains @start. * Search forward from there, if not. N.B., this assumes that the * list of pages handed to migrate_pages()--which is how we get here-- * is in virtual address order. */ static struct page *new_page(struct page *page, unsigned long start) { struct vm_area_struct *vma; unsigned long address; vma = find_vma(current->mm, start); while (vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) break; vma = vma->vm_next; } if (PageHuge(page)) { return alloc_huge_page_vma(page_hstate(compound_head(page)), vma, address); } else if (PageTransHuge(page)) { struct page *thp; thp = alloc_hugepage_vma(GFP_TRANSHUGE, vma, address, HPAGE_PMD_ORDER); if (!thp) return NULL; prep_transhuge_page(thp); return thp; } /* * if !vma, alloc_page_vma() will use task or system default policy */ return alloc_page_vma(GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL, vma, address); } #else static int migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags) { return -EIO; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return -ENOSYS; } static struct page *new_page(struct page *page, unsigned long start) { return NULL; } #endif static long do_mbind(unsigned long start, unsigned long len, unsigned short mode, unsigned short mode_flags, nodemask_t *nmask, unsigned long flags) { struct mm_struct *mm = current->mm; struct mempolicy *new; unsigned long end; int err; int ret; LIST_HEAD(pagelist); if (flags & ~(unsigned long)MPOL_MF_VALID) return -EINVAL; if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) return -EINVAL; if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; len = (len + PAGE_SIZE - 1) & PAGE_MASK; end = start + len; if (end < start) return -EINVAL; if (end == start) return 0; new = mpol_new(mode, mode_flags, nmask); if (IS_ERR(new)) return PTR_ERR(new); if (flags & MPOL_MF_LAZY) new->flags |= MPOL_F_MOF; /* * If we are using the default policy then operation * on discontinuous address spaces is okay after all */ if (!new) flags |= MPOL_MF_DISCONTIG_OK; pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n", start, start + len, mode, mode_flags, nmask ? nodes_addr(*nmask)[0] : NUMA_NO_NODE); if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { err = migrate_prep(); if (err) goto mpol_out; } { NODEMASK_SCRATCH(scratch); if (scratch) { mmap_write_lock(mm); err = mpol_set_nodemask(new, nmask, scratch); if (err) mmap_write_unlock(mm); } else err = -ENOMEM; NODEMASK_SCRATCH_FREE(scratch); } if (err) goto mpol_out; ret = queue_pages_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); if (ret < 0) { err = ret; goto up_out; } err = mbind_range(mm, start, end, new); if (!err) { int nr_failed = 0; if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_page, NULL, start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); } if ((ret > 0) || (nr_failed && (flags & MPOL_MF_STRICT))) err = -EIO; } else { up_out: if (!list_empty(&pagelist)) putback_movable_pages(&pagelist); } mmap_write_unlock(mm); mpol_out: mpol_put(new); return err; } /* * User space interface with variable sized bitmaps for nodelists. */ /* Copy a node mask from user space. */ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; unsigned long t; unsigned long nlongs; unsigned long endmask; --maxnode; nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; if (maxnode > PAGE_SIZE*BITS_PER_BYTE) return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); if ((maxnode % BITS_PER_LONG) == 0) endmask = ~0UL; else endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; /* * When the user specified more nodes than supported just check * if the non supported part is all zero. * * If maxnode have more longs than MAX_NUMNODES, check * the bits in that area first. And then go through to * check the rest bits which equal or bigger than MAX_NUMNODES. * Otherwise, just check bits [MAX_NUMNODES, maxnode). */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { if (get_user(t, nmask + k)) return -EFAULT; if (k == nlongs - 1) { if (t & endmask) return -EINVAL; } else if (t) return -EINVAL; } nlongs = BITS_TO_LONGS(MAX_NUMNODES); endmask = ~0UL; } if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { unsigned long valid_mask = endmask; valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); if (get_user(t, nmask + nlongs - 1)) return -EFAULT; if (t & valid_mask) return -EINVAL; } if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) return -EFAULT; nodes_addr(*nodes)[nlongs-1] &= endmask; return 0; } /* Copy a kernel node mask to user space */ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, nodemask_t *nodes) { unsigned long copy = ALIGN(maxnode-1, 64) / 8; unsigned int nbytes = BITS_TO_LONGS(nr_node_ids) * sizeof(long); if (copy > nbytes) { if (copy > PAGE_SIZE) return -EINVAL; if (clear_user((char __user *)mask + nbytes, copy - nbytes)) return -EFAULT; copy = nbytes; } return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } static long kernel_mbind(unsigned long start, unsigned long len, unsigned long mode, const unsigned long __user *nmask, unsigned long maxnode, unsigned int flags) { nodemask_t nodes; int err; unsigned short mode_flags; start = untagged_addr(start); mode_flags = mode & MPOL_MODE_FLAGS; mode &= ~MPOL_MODE_FLAGS; if (mode >= MPOL_MAX) return -EINVAL; if ((mode_flags & MPOL_F_STATIC_NODES) && (mode_flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_mbind(start, len, mode, mode_flags, &nodes, flags); } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) { return kernel_mbind(start, len, mode, nmask, maxnode, flags); } /* Set the process memory policy */ static long kernel_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode) { int err; nodemask_t nodes; unsigned short flags; flags = mode & MPOL_MODE_FLAGS; mode &= ~MPOL_MODE_FLAGS; if ((unsigned int)mode >= MPOL_MAX) return -EINVAL; if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES)) return -EINVAL; err = get_nodes(&nodes, nmask, maxnode); if (err) return err; return do_set_mempolicy(mode, flags, &nodes); } SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { return kernel_set_mempolicy(mode, nmask, maxnode); } static int kernel_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) { struct mm_struct *mm = NULL; struct task_struct *task; nodemask_t task_nodes; int err; nodemask_t *old; nodemask_t *new; NODEMASK_SCRATCH(scratch); if (!scratch) return -ENOMEM; old = &scratch->mask1; new = &scratch->mask2; err = get_nodes(old, old_nodes, maxnode); if (err) goto out; err = get_nodes(new, new_nodes, maxnode); if (err) goto out; /* Find the mm_struct */ rcu_read_lock(); task = pid ? find_task_by_vpid(pid) : current; if (!task) { rcu_read_unlock(); err = -ESRCH; goto out; } get_task_struct(task); err = -EINVAL; /* * Check if this process has the right to modify the specified process. * Use the regular "ptrace_may_access()" checks. */ if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out_put; } rcu_read_unlock(); task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out_put; } task_nodes = cpuset_mems_allowed(current); nodes_and(*new, *new, task_nodes); if (nodes_empty(*new)) goto out_put; err = security_task_movememory(task); if (err) goto out_put; mm = get_task_mm(task); put_task_struct(task); if (!mm) { err = -EINVAL; goto out; } err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); mmput(mm); out: NODEMASK_SCRATCH_FREE(scratch); return err; out_put: put_task_struct(task); goto out; } SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, old_nodes, const unsigned long __user *, new_nodes) { return kernel_migrate_pages(pid, maxnode, old_nodes, new_nodes); } /* Retrieve NUMA policy */ static int kernel_get_mempolicy(int __user *policy, unsigned long __user *nmask, unsigned long maxnode, unsigned long addr, unsigned long flags) { int err; int pval; nodemask_t nodes; if (nmask != NULL && maxnode < nr_node_ids) return -EINVAL; addr = untagged_addr(addr); err = do_get_mempolicy(&pval, &nodes, addr, flags); if (err) return err; if (policy && put_user(pval, policy)) return -EFAULT; if (nmask) err = copy_nodes_to_user(nmask, maxnode, &nodes); return err; } SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, unsigned long __user *, nmask, unsigned long, maxnode, unsigned long, addr, unsigned long, flags) { return kernel_get_mempolicy(policy, nmask, maxnode, addr, flags); } #ifdef CONFIG_COMPAT COMPAT_SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode, compat_ulong_t, addr, compat_ulong_t, flags) { long err; unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); nr_bits = min_t(unsigned long, maxnode-1, nr_node_ids); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) nm = compat_alloc_user_space(alloc_size); err = kernel_get_mempolicy(policy, nm, nr_bits+1, addr, flags); if (!err && nmask) { unsigned long copy_size; copy_size = min_t(unsigned long, sizeof(bm), alloc_size); err = copy_from_user(bm, nm, copy_size); /* ensure entire bitmap is zeroed */ err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8); err |= compat_put_bitmap(nmask, bm, nr_bits); } return err; } COMPAT_SYSCALL_DEFINE3(set_mempolicy, int, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode) { unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; DECLARE_BITMAP(bm, MAX_NUMNODES); nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { if (compat_get_bitmap(bm, nmask, nr_bits)) return -EFAULT; nm = compat_alloc_user_space(alloc_size); if (copy_to_user(nm, bm, alloc_size)) return -EFAULT; } return kernel_set_mempolicy(mode, nm, nr_bits+1); } COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len, compat_ulong_t, mode, compat_ulong_t __user *, nmask, compat_ulong_t, maxnode, compat_ulong_t, flags) { unsigned long __user *nm = NULL; unsigned long nr_bits, alloc_size; nodemask_t bm; nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES); alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (nmask) { if (compat_get_bitmap(nodes_addr(bm), nmask, nr_bits)) return -EFAULT; nm = compat_alloc_user_space(alloc_size); if (copy_to_user(nm, nodes_addr(bm), alloc_size)) return -EFAULT; } return kernel_mbind(start, len, mode, nm, nr_bits+1, flags); } COMPAT_SYSCALL_DEFINE4(migrate_pages, compat_pid_t, pid, compat_ulong_t, maxnode, const compat_ulong_t __user *, old_nodes, const compat_ulong_t __user *, new_nodes) { unsigned long __user *old = NULL; unsigned long __user *new = NULL; nodemask_t tmp_mask; unsigned long nr_bits; unsigned long size; nr_bits = min_t(unsigned long, maxnode - 1, MAX_NUMNODES); size = ALIGN(nr_bits, BITS_PER_LONG) / 8; if (old_nodes) { if (compat_get_bitmap(nodes_addr(tmp_mask), old_nodes, nr_bits)) return -EFAULT; old = compat_alloc_user_space(new_nodes ? size * 2 : size); if (new_nodes) new = old + size / sizeof(unsigned long); if (copy_to_user(old, nodes_addr(tmp_mask), size)) return -EFAULT; } if (new_nodes) { if (compat_get_bitmap(nodes_addr(tmp_mask), new_nodes, nr_bits)) return -EFAULT; if (new == NULL) new = compat_alloc_user_space(size); if (copy_to_user(new, nodes_addr(tmp_mask), size)) return -EFAULT; } return kernel_migrate_pages(pid, nr_bits + 1, old, new); } #endif /* CONFIG_COMPAT */ bool vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return false; /* * DAX device mappings require predictable access latency, so avoid * incurring periodic faults. */ if (vma_is_dax(vma)) return false; if (is_vm_hugetlb_page(vma) && !hugepage_migration_supported(hstate_vma(vma))) return false; /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not * possible. */ if (vma->vm_file && gfp_zone(mapping_gfp_mask(vma->vm_file->f_mapping)) < policy_zone) return false; return true; } struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = NULL; if (vma) { if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); } else if (vma->vm_policy) { pol = vma->vm_policy; /* * shmem_alloc_page() passes MPOL_F_SHARED policy with * a pseudo vma whose vma->vm_ops=NULL. Take a reference * count on these policies which will be dropped by * mpol_cond_put() later */ if (mpol_needs_cond_ref(pol)) mpol_get(pol); } } return pol; } /* * get_vma_policy(@vma, @addr) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup * * Returns effective policy for a VMA at specified address. * Falls back to current->mempolicy or system default policy, as necessary. * Shared policies [those marked as MPOL_F_SHARED] require an extra reference * count--added by the get_policy() vm_op, as appropriate--to protect against * freeing by another task. It is the caller's responsibility to free the * extra reference for shared policies. */ static struct mempolicy *get_vma_policy(struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = __get_vma_policy(vma, addr); if (!pol) pol = get_task_policy(current); return pol; } bool vma_policy_mof(struct vm_area_struct *vma) { struct mempolicy *pol; if (vma->vm_ops && vma->vm_ops->get_policy) { bool ret = false; pol = vma->vm_ops->get_policy(vma, vma->vm_start); if (pol && (pol->flags & MPOL_F_MOF)) ret = true; mpol_cond_put(pol); return ret; } pol = vma->vm_policy; if (!pol) pol = get_task_policy(current); return pol->flags & MPOL_F_MOF; } static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; BUG_ON(dynamic_policy_zone == ZONE_MOVABLE); /* * if policy->v.nodes has movable memory only, * we apply policy when gfp_zone(gfp) = ZONE_MOVABLE only. * * policy->v.nodes is intersect with node_states[N_MEMORY]. * so if the following test faile, it implies * policy->v.nodes has movable memory only. */ if (!nodes_intersects(policy->v.nodes, node_states[N_HIGH_MEMORY])) dynamic_policy_zone = ZONE_MOVABLE; return zone >= dynamic_policy_zone; } /* * Return a nodemask representing a mempolicy for filtering nodes for * page allocation */ nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy) { /* Lower zones don't get a nodemask applied for MPOL_BIND */ if (unlikely(policy->mode == MPOL_BIND) && apply_policy_zone(policy, gfp_zone(gfp)) && cpuset_nodemask_valid_mems_allowed(&policy->v.nodes)) return &policy->v.nodes; return NULL; } /* Return the node id preferred by the given mempolicy, or the given id */ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) { if (policy->mode == MPOL_PREFERRED && !(policy->flags & MPOL_F_LOCAL)) nd = policy->v.preferred_node; else { /* * __GFP_THISNODE shouldn't even be used with the bind policy * because we might easily break the expectation to stay on the * requested node and not break the policy. */ WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } return nd; } /* Do dynamic interleaving for a process */ static unsigned interleave_nodes(struct mempolicy *policy) { unsigned next; struct task_struct *me = current; next = next_node_in(me->il_prev, policy->v.nodes); if (next < MAX_NUMNODES) me->il_prev = next; return next; } /* * Depending on the memory policy provide a node from which to allocate the * next slab entry. */ unsigned int mempolicy_slab_node(void) { struct mempolicy *policy; int node = numa_mem_id(); if (in_interrupt()) return node; policy = current->mempolicy; if (!policy || policy->flags & MPOL_F_LOCAL) return node; switch (policy->mode) { case MPOL_PREFERRED: /* * handled MPOL_F_LOCAL above */ return policy->v.preferred_node; case MPOL_INTERLEAVE: return interleave_nodes(policy); case MPOL_BIND: { struct zoneref *z; /* * Follow bind policy behavior and start allocation at the * first node. */ struct zonelist *zonelist; enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL); zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); return z->zone ? zone_to_nid(z->zone) : node; } default: BUG(); } } /* * Do static interleaving for a VMA with known offset @n. Returns the n'th * node in pol->v.nodes (starting from n=0), wrapping around if n exceeds the * number of present nodes. */ static unsigned offset_il_node(struct mempolicy *pol, unsigned long n) { unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int i; int nid; if (!nnodes) return numa_node_id(); target = (unsigned int)n % nnodes; nid = first_node(pol->v.nodes); for (i = 0; i < target; i++) nid = next_node(nid, pol->v.nodes); return nid; } /* Determine a node number for interleave */ static inline unsigned interleave_nid(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long addr, int shift) { if (vma) { unsigned long off; /* * for small pages, there is no difference between * shift and PAGE_SHIFT, so the bit-shift is safe. * for huge pages, since vm_pgoff is in units of small * pages, we need to shift off the always 0 bits to get * a useful offset. */ BUG_ON(shift < PAGE_SHIFT); off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, off); } else return interleave_nodes(pol); } #ifdef CONFIG_HUGETLBFS /* * huge_node(@vma, @addr, @gfp_flags, @mpol) * @vma: virtual memory area whose policy is sought * @addr: address in @vma for shared policy lookup and interleave policy * @gfp_flags: for requested zone * @mpol: pointer to mempolicy pointer for reference counted mempolicy * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a nid suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. * * Must be protected by read_mems_allowed_begin() */ int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { int nid; *mpol = get_vma_policy(vma, addr); *nodemask = NULL; /* assume !MPOL_BIND */ if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) { nid = interleave_nid(*mpol, vma, addr, huge_page_shift(hstate_vma(vma))); } else { nid = policy_node(gfp_flags, *mpol, numa_node_id()); if ((*mpol)->mode == MPOL_BIND) *nodemask = &(*mpol)->v.nodes; } return nid; } /* * init_nodemask_of_mempolicy * * If the current task's mempolicy is "default" [NULL], return 'false' * to indicate default policy. Otherwise, extract the policy nodemask * for 'bind' or 'interleave' policy into the argument nodemask, or * initialize the argument nodemask to contain the single node for * 'preferred' or 'local' policy and return 'true' to indicate presence * of non-default mempolicy. * * We don't bother with reference counting the mempolicy [mpol_get/put] * because the current task is examining it's own mempolicy and a task's * mempolicy is only ever changed by the task itself. * * N.B., it is the caller's responsibility to free a returned nodemask. */ bool init_nodemask_of_mempolicy(nodemask_t *mask) { struct mempolicy *mempolicy; int nid; if (!(mask && current->mempolicy)) return false; task_lock(current); mempolicy = current->mempolicy; switch (mempolicy->mode) { case MPOL_PREFERRED: if (mempolicy->flags & MPOL_F_LOCAL) nid = numa_node_id(); else nid = mempolicy->v.preferred_node; init_nodemask_of_node(mask, nid); break; case MPOL_BIND: case MPOL_INTERLEAVE: *mask = mempolicy->v.nodes; break; default: BUG(); } task_unlock(current); return true; } #endif /* * mempolicy_nodemask_intersects * * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default * policy. Otherwise, check for intersection between mask and the policy * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' * policy, always return true since it may allocate elsewhere on fallback. * * Takes task_lock(tsk) to prevent freeing of its mempolicy. */ bool mempolicy_nodemask_intersects(struct task_struct *tsk, const nodemask_t *mask) { struct mempolicy *mempolicy; bool ret = true; if (!mask) return ret; task_lock(tsk); mempolicy = tsk->mempolicy; if (!mempolicy) goto out; switch (mempolicy->mode) { case MPOL_PREFERRED: /* * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to * allocate from, they may fallback to other nodes when oom. * Thus, it's possible for tsk to have allocated memory from * nodes in mask. */ break; case MPOL_BIND: case MPOL_INTERLEAVE: ret = nodes_intersects(mempolicy->v.nodes, *mask); break; default: BUG(); } out: task_unlock(tsk); return ret; } /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, unsigned nid) { struct page *page; page = __alloc_pages(gfp, order, nid); /* skip NUMA_INTERLEAVE_HIT counter update if numa stats is disabled */ if (!static_branch_likely(&vm_numa_stat_key)) return page; if (page && page_to_nid(page) == nid) { preempt_disable(); __inc_numa_state(page_zone(page), NUMA_INTERLEAVE_HIT); preempt_enable(); } return page; } /** * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. * %GFP_KERNEL kernel allocations, * %GFP_HIGHMEM highmem/user allocations, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @node: Which node to prefer for allocation (modulo policy). * @hugepage: for hugepages try only the preferred node if possible * * This function allocates a page from the kernel page pool and applies * a NUMA policy associated with the VMA or the current process. * When VMA is not NULL caller must read-lock the mmap_lock of the * mm_struct of the VMA to prevent it from going away. Should be used for * all allocations for pages that will be mapped into user space. Returns * NULL when no page can be allocated. */ struct page * alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage) { struct mempolicy *pol; struct page *page; int preferred_nid; nodemask_t *nmask; pol = get_vma_policy(vma, addr); if (pol->mode == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); mpol_cond_put(pol); page = alloc_page_interleave(gfp, order, nid); goto out; } if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { int hpage_node = node; /* * For hugepage allocation and non-interleave policy which * allows the current node (or other explicitly preferred * node) we only try to allocate from the current/preferred * node and don't fall back to other nodes, as the cost of * remote accesses would likely offset THP benefits. * * If the policy is interleave, or does not allow the current * node in its nodemask, we allocate the standard way. */ if (pol->mode == MPOL_PREFERRED && !(pol->flags & MPOL_F_LOCAL)) hpage_node = pol->v.preferred_node; nmask = policy_nodemask(gfp, pol); if (!nmask || node_isset(hpage_node, *nmask)) { mpol_cond_put(pol); /* * First, try to allocate THP only on local node, but * don't reclaim unnecessarily, just compact. */ page = __alloc_pages_node(hpage_node, gfp | __GFP_THISNODE | __GFP_NORETRY, order); /* * If hugepage allocations are configured to always * synchronous compact or the vma has been madvised * to prefer hugepage backing, retry allowing remote * memory with both reclaim and compact as well. */ if (!page && (gfp & __GFP_DIRECT_RECLAIM)) page = __alloc_pages_node(hpage_node, gfp, order); goto out; } } nmask = policy_nodemask(gfp, pol); preferred_nid = policy_node(gfp, pol, node); page = __alloc_pages_nodemask(gfp, order, preferred_nid, nmask); mpol_cond_put(pol); out: return page; } EXPORT_SYMBOL(alloc_pages_vma); /** * alloc_pages_current - Allocate pages. * * @gfp: * %GFP_USER user allocation, * %GFP_KERNEL kernel allocation, * %GFP_HIGHMEM highmem allocation, * %GFP_FS don't call back into a file system. * %GFP_ATOMIC don't sleep. * @order: Power of two of allocation size in pages. 0 is a single page. * * Allocate a page from the kernel page pool. When not in * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. */ struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = &default_policy; struct page *page; if (!in_interrupt() && !(gfp & __GFP_THISNODE)) pol = get_task_policy(current); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else page = __alloc_pages_nodemask(gfp, order, policy_node(gfp, pol, numa_node_id()), policy_nodemask(gfp, pol)); return page; } EXPORT_SYMBOL(alloc_pages_current); int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { struct mempolicy *pol = mpol_dup(vma_policy(src)); if (IS_ERR(pol)) return PTR_ERR(pol); dst->vm_policy = pol; return 0; } /* * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it * rebinds the mempolicy its copying by calling mpol_rebind_policy() * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). * * current's mempolicy may be rebinded by the other task(the task that changes * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ struct mempolicy *__mpol_dup(struct mempolicy *old) { struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* task's mempolicy is protected by alloc_lock */ if (old == current->mempolicy) { task_lock(current); *new = *old; task_unlock(current); } else *new = *old; if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); mpol_rebind_policy(new, &mems); } atomic_set(&new->refcnt, 1); return new; } /* Slow path of a mempolicy comparison */ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) return false; if (a->mode != b->mode) return false; if (a->flags != b->flags) return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; switch (a->mode) { case MPOL_BIND: case MPOL_INTERLEAVE: return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: /* a's ->flags is the same as b's */ if (a->flags & MPOL_F_LOCAL) return true; return a->v.preferred_node == b->v.preferred_node; default: BUG(); return false; } } /* * Shared memory backing store policy support. * * Remember policies even when nobody has shared memory mapped. * The policies are kept in Red-Black tree linked from the inode. * They are protected by the sp->lock rwlock, which should be held * for any accesses to the tree. */ /* * lookup first element intersecting start-end. Caller holds sp->lock for * reading or for writing */ static struct sp_node * sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end) { struct rb_node *n = sp->root.rb_node; while (n) { struct sp_node *p = rb_entry(n, struct sp_node, nd); if (start >= p->end) n = n->rb_right; else if (end <= p->start) n = n->rb_left; else break; } if (!n) return NULL; for (;;) { struct sp_node *w = NULL; struct rb_node *prev = rb_prev(n); if (!prev) break; w = rb_entry(prev, struct sp_node, nd); if (w->end <= start) break; n = prev; } return rb_entry(n, struct sp_node, nd); } /* * Insert a new shared policy into the list. Caller holds sp->lock for * writing. */ static void sp_insert(struct shared_policy *sp, struct sp_node *new) { struct rb_node **p = &sp->root.rb_node; struct rb_node *parent = NULL; struct sp_node *nd; while (*p) { parent = *p; nd = rb_entry(parent, struct sp_node, nd); if (new->start < nd->start) p = &(*p)->rb_left; else if (new->end > nd->end) p = &(*p)->rb_right; else BUG(); } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, new->policy ? new->policy->mode : 0); } /* Find shared policy intersecting idx */ struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) { struct mempolicy *pol = NULL; struct sp_node *sn; if (!sp->root.rb_node) return NULL; read_lock(&sp->lock); sn = sp_lookup(sp, idx, idx+1); if (sn) { mpol_get(sn->policy); pol = sn->policy; } read_unlock(&sp->lock); return pol; } static void sp_free(struct sp_node *n) { mpol_put(n->policy); kmem_cache_free(sn_cache, n); } /** * mpol_misplaced - check whether current page node is valid in policy * * @page: page to be checked * @vma: vm area where page mapped * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's * node id. * * Returns: * -1 - not misplaced, page is in the right node * node - node id where the page should be * * Policy determination "mimics" alloc_page_vma(). * Called from fault path where we know the vma and faulting address. */ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol; struct zoneref *z; int curnid = page_to_nid(page); unsigned long pgoff; int thiscpu = raw_smp_processor_id(); int thisnid = cpu_to_node(thiscpu); int polnid = NUMA_NO_NODE; int ret = -1; pol = get_vma_policy(vma, addr); if (!(pol->flags & MPOL_F_MOF)) goto out; switch (pol->mode) { case MPOL_INTERLEAVE: pgoff = vma->vm_pgoff; pgoff += (addr - vma->vm_start) >> PAGE_SHIFT; polnid = offset_il_node(pol, pgoff); break; case MPOL_PREFERRED: if (pol->flags & MPOL_F_LOCAL) polnid = numa_node_id(); else polnid = pol->v.preferred_node; break; case MPOL_BIND: /* * allows binding to multiple nodes. * use current page if in policy nodemask, * else select nearest allowed node, if any. * If no allowed nodes, use current [!misplaced]. */ if (node_isset(curnid, pol->v.nodes)) goto out; z = first_zones_zonelist( node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->v.nodes); polnid = zone_to_nid(z->zone); break; default: BUG(); } /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { polnid = thisnid; if (!should_numa_migrate_memory(current, page, curnid, thiscpu)) goto out; } if (curnid != polnid) ret = polnid; out: mpol_cond_put(pol); return ret; } /* * Drop the (possibly final) reference to task->mempolicy. It needs to be * dropped after task->mempolicy is set to NULL so that any allocation done as * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed * policy. */ void mpol_put_task_policy(struct task_struct *task) { struct mempolicy *pol; task_lock(task); pol = task->mempolicy; task->mempolicy = NULL; task_unlock(task); mpol_put(pol); } static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); sp_free(n); } static void sp_node_init(struct sp_node *node, unsigned long start, unsigned long end, struct mempolicy *pol) { node->start = start; node->end = end; node->policy = pol; } static struct sp_node *sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) { struct sp_node *n; struct mempolicy *newpol; n = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n) return NULL; newpol = mpol_dup(pol); if (IS_ERR(newpol)) { kmem_cache_free(sn_cache, n); return NULL; } newpol->flags |= MPOL_F_SHARED; sp_node_init(n, start, end, newpol); return n; } /* Replace a policy range. */ static int shared_policy_replace(struct shared_policy *sp, unsigned long start, unsigned long end, struct sp_node *new) { struct sp_node *n; struct sp_node *n_new = NULL; struct mempolicy *mpol_new = NULL; int ret = 0; restart: write_lock(&sp->lock); n = sp_lookup(sp, start, end); /* Take care of old policies in the same range. */ while (n && n->start < end) { struct rb_node *next = rb_next(&n->nd); if (n->start >= start) { if (n->end <= end) sp_delete(sp, n); else n->start = end; } else { /* Old policy spanning whole new range. */ if (n->end > end) { if (!n_new) goto alloc_new; *mpol_new = *n->policy; atomic_set(&mpol_new->refcnt, 1); sp_node_init(n_new, end, n->end, mpol_new); n->end = start; sp_insert(sp, n_new); n_new = NULL; mpol_new = NULL; break; } else n->end = start; } if (!next) break; n = rb_entry(next, struct sp_node, nd); } if (new) sp_insert(sp, new); write_unlock(&sp->lock); ret = 0; err_out: if (mpol_new) mpol_put(mpol_new); if (n_new) kmem_cache_free(sn_cache, n_new); return ret; alloc_new: write_unlock(&sp->lock); ret = -ENOMEM; n_new = kmem_cache_alloc(sn_cache, GFP_KERNEL); if (!n_new) goto err_out; mpol_new = kmem_cache_alloc(policy_cache, GFP_KERNEL); if (!mpol_new) goto err_out; goto restart; } /** * mpol_shared_policy_init - initialize shared policy for inode * @sp: pointer to inode shared policy * @mpol: struct mempolicy to install * * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { int ret; sp->root = RB_ROOT; /* empty tree == default mempolicy */ rwlock_init(&sp->lock); if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; NODEMASK_SCRATCH(scratch); if (!scratch) goto put_mpol; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); if (IS_ERR(new)) goto free_scratch; /* no valid nodemask intersection */ task_lock(current); ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); if (ret) goto put_new; /* Create pseudo-vma that contains just the policy */ vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ put_new: mpol_put(new); /* drop initial ref */ free_scratch: NODEMASK_SCRATCH_FREE(scratch); put_mpol: mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, struct mempolicy *npol) { int err; struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n", vma->vm_pgoff, sz, npol ? npol->mode : -1, npol ? npol->flags : -1, npol ? nodes_addr(npol->v.nodes)[0] : NUMA_NO_NODE); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); if (!new) return -ENOMEM; } err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new); if (err && new) sp_free(new); return err; } /* Free a backing policy store on inode delete. */ void mpol_free_shared_policy(struct shared_policy *p) { struct sp_node *n; struct rb_node *next; if (!p->root.rb_node) return; write_lock(&p->lock); next = rb_first(&p->root); while (next) { n = rb_entry(next, struct sp_node, nd); next = rb_next(&n->nd); sp_delete(p, n); } write_unlock(&p->lock); } #ifdef CONFIG_NUMA_BALANCING static int __initdata numabalancing_override; static void __init check_numabalancing_enable(void) { bool numabalancing_default = false; if (IS_ENABLED(CONFIG_NUMA_BALANCING_DEFAULT_ENABLED)) numabalancing_default = true; /* Parsed by setup_numabalancing. override == 1 enables, -1 disables */ if (numabalancing_override) set_numabalancing_state(numabalancing_override == 1); if (num_online_nodes() > 1 && !numabalancing_override) { pr_info("%s automatic NUMA balancing. Configure with numa_balancing= or the kernel.numa_balancing sysctl\n", numabalancing_default ? "Enabling" : "Disabling"); set_numabalancing_state(numabalancing_default); } } static int __init setup_numabalancing(char *str) { int ret = 0; if (!str) goto out; if (!strcmp(str, "enable")) { numabalancing_override = 1; ret = 1; } else if (!strcmp(str, "disable")) { numabalancing_override = -1; ret = 1; } out: if (!ret) pr_warn("Unable to parse numa_balancing=\n"); return ret; } __setup("numa_balancing=", setup_numabalancing); #else static inline void __init check_numabalancing_enable(void) { } #endif /* CONFIG_NUMA_BALANCING */ /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { nodemask_t interleave_nodes; unsigned long largest = 0; int nid, prefer = 0; policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), 0, SLAB_PANIC, NULL); for_each_node(nid) { preferred_node_policy[nid] = (struct mempolicy) { .refcnt = ATOMIC_INIT(1), .mode = MPOL_PREFERRED, .flags = MPOL_F_MOF | MPOL_F_MORON, .v = { .preferred_node = nid, }, }; } /* * Set interleaving policy for system init. Interleaving is only * enabled across suitably sized nodes (default is >= 16MB), or * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); for_each_node_state(nid, N_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ if (largest < total_pages) { largest = total_pages; prefer = nid; } /* Interleave this node? */ if ((total_pages << PAGE_SHIFT) >= (16 << 20)) node_set(nid, interleave_nodes); } /* All too small, use the largest */ if (unlikely(nodes_empty(interleave_nodes))) node_set(prefer, interleave_nodes); if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes)) pr_err("%s: interleaving failed\n", __func__); check_numabalancing_enable(); } /* Reset policy of current process to default */ void numa_default_policy(void) { do_set_mempolicy(MPOL_DEFAULT, 0, NULL); } /* * Parse and format mempolicy from/to strings */ /* * "local" is implemented internally by MPOL_PREFERRED with MPOL_F_LOCAL flag. */ static const char * const policy_modes[] = { [MPOL_DEFAULT] = "default", [MPOL_PREFERRED] = "prefer", [MPOL_BIND] = "bind", [MPOL_INTERLEAVE] = "interleave", [MPOL_LOCAL] = "local", }; #ifdef CONFIG_TMPFS /** * mpol_parse_str - parse string to mempolicy, for tmpfs mpol mount option. * @str: string containing mempolicy to parse * @mpol: pointer to struct mempolicy pointer, returned on success. * * Format of input: * <mode>[=<flags>][:<nodelist>] * * On success, returns 0, else 1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { struct mempolicy *new = NULL; unsigned short mode_flags; nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); int err = 1, mode; if (flags) *flags++ = '\0'; /* terminate mode string */ if (nodelist) { /* NUL-terminate mode or flags string */ *nodelist++ = '\0'; if (nodelist_parse(nodelist, nodes)) goto out; if (!nodes_subset(nodes, node_states[N_MEMORY])) goto out; } else nodes_clear(nodes); mode = match_string(policy_modes, MPOL_MAX, str); if (mode < 0) goto out; switch (mode) { case MPOL_PREFERRED: /* * Insist on a nodelist of one node only, although later * we use first_node(nodes) to grab a single node, so here * nodelist (or nodes) cannot be empty. */ if (nodelist) { char *rest = nodelist; while (isdigit(*rest)) rest++; if (*rest) goto out; if (nodes_empty(nodes)) goto out; } break; case MPOL_INTERLEAVE: /* * Default to online nodes with memory if no nodelist */ if (!nodelist) nodes = node_states[N_MEMORY]; break; case MPOL_LOCAL: /* * Don't allow a nodelist; mpol_new() checks flags */ if (nodelist) goto out; mode = MPOL_PREFERRED; break; case MPOL_DEFAULT: /* * Insist on a empty nodelist */ if (!nodelist) err = 0; goto out; case MPOL_BIND: /* * Insist on a nodelist */ if (!nodelist) goto out; } mode_flags = 0; if (flags) { /* * Currently, we only support two mutually exclusive * mode flags. */ if (!strcmp(flags, "static")) mode_flags |= MPOL_F_STATIC_NODES; else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) goto out; /* * Save nodes for mpol_to_str() to show the tmpfs mount options * for /proc/mounts, /proc/pid/mounts and /proc/pid/mountinfo. */ if (mode != MPOL_PREFERRED) new->v.nodes = nodes; else if (nodelist) new->v.preferred_node = first_node(nodes); else new->flags |= MPOL_F_LOCAL; /* * Save nodes for contextualization: this will be used to "clone" * the mempolicy in a specific context [cpuset] at a later time. */ new->w.user_nodemask = nodes; err = 0; out: /* Restore string for error message */ if (nodelist) *--nodelist = ':'; if (flags) *--flags = '='; if (!err) *mpol = new; return err; } #endif /* CONFIG_TMPFS */ /** * mpol_to_str - format a mempolicy structure for printing * @buffer: to contain formatted mempolicy string * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * * Convert @pol into a string. If @buffer is too short, truncate the string. * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the * longest flag, "relative", and to display at least a few node ids. */ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; nodemask_t nodes = NODE_MASK_NONE; unsigned short mode = MPOL_DEFAULT; unsigned short flags = 0; if (pol && pol != &default_policy && !(pol->flags & MPOL_F_MORON)) { mode = pol->mode; flags = pol->flags; } switch (mode) { case MPOL_DEFAULT: break; case MPOL_PREFERRED: if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; case MPOL_BIND: case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; default: WARN_ON_ONCE(1); snprintf(p, maxlen, "unknown"); return; } p += snprintf(p, maxlen, "%s", policy_modes[mode]); if (flags & MPOL_MODE_FLAGS) { p += snprintf(p, buffer + maxlen - p, "="); /* * Currently, the only defined flags are mutually exclusive */ if (flags & MPOL_F_STATIC_NODES) p += snprintf(p, buffer + maxlen - p, "static"); else if (flags & MPOL_F_RELATIVE_NODES) p += snprintf(p, buffer + maxlen - p, "relative"); } if (!nodes_empty(nodes)) p += scnprintf(p, buffer + maxlen - p, ":%*pbl", nodemask_pr_args(&nodes)); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_USER_NAMESPACE_H #define _LINUX_USER_NAMESPACE_H #include <linux/kref.h> #include <linux/nsproxy.h> #include <linux/ns_common.h> #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/rwsem.h> #include <linux/sysctl.h> #include <linux/err.h> #define UID_GID_MAP_MAX_BASE_EXTENTS 5 #define UID_GID_MAP_MAX_EXTENTS 340 struct uid_gid_extent { u32 first; u32 lower_first; u32 count; }; struct uid_gid_map { /* 64 bytes -- 1 cache line */ u32 nr_extents; union { struct uid_gid_extent extent[UID_GID_MAP_MAX_BASE_EXTENTS]; struct { struct uid_gid_extent *forward; struct uid_gid_extent *reverse; }; }; }; #define USERNS_SETGROUPS_ALLOWED 1UL #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED struct ucounts; enum ucount_type { UCOUNT_USER_NAMESPACES, UCOUNT_PID_NAMESPACES, UCOUNT_UTS_NAMESPACES, UCOUNT_IPC_NAMESPACES, UCOUNT_NET_NAMESPACES, UCOUNT_MNT_NAMESPACES, UCOUNT_CGROUP_NAMESPACES, UCOUNT_TIME_NAMESPACES, #ifdef CONFIG_INOTIFY_USER UCOUNT_INOTIFY_INSTANCES, UCOUNT_INOTIFY_WATCHES, #endif UCOUNT_COUNTS, }; struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; struct uid_gid_map projid_map; atomic_t count; struct user_namespace *parent; int level; kuid_t owner; kgid_t group; struct ns_common ns; unsigned long flags; /* parent_could_setfcap: true if the creator if this ns had CAP_SETFCAP * in its effective capability set at the child ns creation time. */ bool parent_could_setfcap; #ifdef CONFIG_KEYS /* List of joinable keyrings in this namespace. Modification access of * these pointers is controlled by keyring_sem. Once * user_keyring_register is set, it won't be changed, so it can be * accessed directly with READ_ONCE(). */ struct list_head keyring_name_list; struct key *user_keyring_register; struct rw_semaphore keyring_sem; #endif /* Register of per-UID persistent keyrings for this namespace */ #ifdef CONFIG_PERSISTENT_KEYRINGS struct key *persistent_keyring_register; #endif struct work_struct work; #ifdef CONFIG_SYSCTL struct ctl_table_set set; struct ctl_table_header *sysctls; #endif struct ucounts *ucounts; int ucount_max[UCOUNT_COUNTS]; } __randomize_layout; struct ucounts { struct hlist_node node; struct user_namespace *ns; kuid_t uid; int count; atomic_t ucount[UCOUNT_COUNTS]; }; extern struct user_namespace init_user_ns; bool setup_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type); #ifdef CONFIG_USER_NS static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { if (ns) atomic_inc(&ns->count); return ns; } extern int create_user_ns(struct cred *new); extern int unshare_userns(unsigned long unshare_flags, struct cred **new_cred); extern void __put_user_ns(struct user_namespace *ns); static inline void put_user_ns(struct user_namespace *ns) { if (ns && atomic_dec_and_test(&ns->count)) __put_user_ns(ns); } struct seq_operations; extern const struct seq_operations proc_uid_seq_operations; extern const struct seq_operations proc_gid_seq_operations; extern const struct seq_operations proc_projid_seq_operations; extern ssize_t proc_uid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_gid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_projid_map_write(struct file *, const char __user *, size_t, loff_t *); extern ssize_t proc_setgroups_write(struct file *, const char __user *, size_t, loff_t *); extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); extern bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child); extern bool current_in_userns(const struct user_namespace *target_ns); struct ns_common *ns_get_owner(struct ns_common *ns); #else static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; } static inline int create_user_ns(struct cred *new) { return -EINVAL; } static inline int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { if (unshare_flags & CLONE_NEWUSER) return -EINVAL; return 0; } static inline void put_user_ns(struct user_namespace *ns) { } static inline bool userns_may_setgroups(const struct user_namespace *ns) { return true; } static inline bool in_userns(const struct user_namespace *ancestor, const struct user_namespace *child) { return true; } static inline bool current_in_userns(const struct user_namespace *target_ns) { return true; } static inline struct ns_common *ns_get_owner(struct ns_common *ns) { return ERR_PTR(-EPERM); } #endif #endif /* _LINUX_USER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Symmetric key ciphers. * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_SKCIPHER_H #define _CRYPTO_SKCIPHER_H #include <linux/crypto.h> #include <linux/kernel.h> #include <linux/slab.h> /** * struct skcipher_request - Symmetric key cipher request * @cryptlen: Number of bytes to encrypt or decrypt * @iv: Initialisation Vector * @src: Source SG list * @dst: Destination SG list * @base: Underlying async request * @__ctx: Start of private context data */ struct skcipher_request { unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; struct crypto_async_request base; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; struct crypto_skcipher { unsigned int reqsize; struct crypto_tfm base; }; struct crypto_sync_skcipher { struct crypto_skcipher base; }; /** * struct skcipher_alg - symmetric key cipher definition * @min_keysize: Minimum key size supported by the transformation. This is the * smallest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MIN_KEY_SIZE" include/crypto/ * @max_keysize: Maximum key size supported by the transformation. This is the * largest key length supported by this transformation algorithm. * This must be set to one of the pre-defined values as this is * not hardware specific. Possible values for this field can be * found via git grep "_MAX_KEY_SIZE" include/crypto/ * @setkey: Set key for the transformation. This function is used to either * program a supplied key into the hardware or store the key in the * transformation context for programming it later. Note that this * function does modify the transformation context. This function can * be called multiple times during the existence of the transformation * object, so one must make sure the key is properly reprogrammed into * the hardware. This function is also responsible for checking the key * length for validity. In case a software fallback was put in place in * the @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. * @encrypt: Encrypt a scatterlist of blocks. This function is used to encrypt * the supplied scatterlist containing the blocks of data. The crypto * API consumer is responsible for aligning the entries of the * scatterlist properly and making sure the chunks are correctly * sized. In case a software fallback was put in place in the * @cra_init call, this function might need to use the fallback if * the algorithm doesn't support all of the key sizes. In case the * key was stored in transformation context, the key might need to be * re-programmed into the hardware in this function. This function * shall not modify the transformation context, as this function may * be called in parallel with the same transformation object. * @decrypt: Decrypt a single block. This is a reverse counterpart to @encrypt * and the conditions are exactly the same. * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @ivsize: IV size applicable for transformation. The consumer must provide an * IV of exactly that size to perform the encrypt or decrypt operation. * @chunksize: Equal to the block size except for stream ciphers such as * CTR where it is set to the underlying block size. * @walksize: Equal to the chunk size except in cases where the algorithm is * considerably more efficient if it can operate on multiple chunks * in parallel. Should be a multiple of chunksize. * @base: Definition of a generic crypto algorithm. * * All fields except @ivsize are mandatory and must be filled. */ struct skcipher_alg { int (*setkey)(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); int (*encrypt)(struct skcipher_request *req); int (*decrypt)(struct skcipher_request *req); int (*init)(struct crypto_skcipher *tfm); void (*exit)(struct crypto_skcipher *tfm); unsigned int min_keysize; unsigned int max_keysize; unsigned int ivsize; unsigned int chunksize; unsigned int walksize; struct crypto_alg base; }; #define MAX_SYNC_SKCIPHER_REQSIZE 384 /* * This performs a type-check against the "tfm" argument to make sure * all users have the correct skcipher tfm for doing on-stack requests. */ #define SYNC_SKCIPHER_REQUEST_ON_STACK(name, tfm) \ char __##name##_desc[sizeof(struct skcipher_request) + \ MAX_SYNC_SKCIPHER_REQSIZE + \ (!(sizeof((struct crypto_sync_skcipher *)1 == \ (typeof(tfm))1))) \ ] CRYPTO_MINALIGN_ATTR; \ struct skcipher_request *name = (void *)__##name##_desc /** * DOC: Symmetric Key Cipher API * * Symmetric key cipher API is used with the ciphers of type * CRYPTO_ALG_TYPE_SKCIPHER (listed as type "skcipher" in /proc/crypto). * * Asynchronous cipher operations imply that the function invocation for a * cipher request returns immediately before the completion of the operation. * The cipher request is scheduled as a separate kernel thread and therefore * load-balanced on the different CPUs via the process scheduler. To allow * the kernel crypto API to inform the caller about the completion of a cipher * request, the caller must provide a callback function. That function is * invoked with the cipher handle when the request completes. * * To support the asynchronous operation, additional information than just the * cipher handle must be supplied to the kernel crypto API. That additional * information is given by filling in the skcipher_request data structure. * * For the symmetric key cipher API, the state is maintained with the tfm * cipher handle. A single tfm can be used across multiple calls and in * parallel. For asynchronous block cipher calls, context data supplied and * only used by the caller can be referenced the request data structure in * addition to the IV used for the cipher request. The maintenance of such * state information would be important for a crypto driver implementer to * have, because when calling the callback function upon completion of the * cipher operation, that callback function may need some information about * which operation just finished if it invoked multiple in parallel. This * state information is unused by the kernel crypto API. */ static inline struct crypto_skcipher *__crypto_skcipher_cast( struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_skcipher, base); } /** * crypto_alloc_skcipher() - allocate symmetric key cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an skcipher. The returned struct * crypto_skcipher is the cipher handle that is required for any subsequent * API invocation for that skcipher. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name, u32 type, u32 mask); struct crypto_sync_skcipher *crypto_alloc_sync_skcipher(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_skcipher_tfm( struct crypto_skcipher *tfm) { return &tfm->base; } /** * crypto_free_skcipher() - zeroize and free cipher handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_skcipher(struct crypto_skcipher *tfm) { crypto_destroy_tfm(tfm, crypto_skcipher_tfm(tfm)); } static inline void crypto_free_sync_skcipher(struct crypto_sync_skcipher *tfm) { crypto_free_skcipher(&tfm->base); } /** * crypto_has_skcipher() - Search for the availability of an skcipher. * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * skcipher * @type: specifies the type of the skcipher * @mask: specifies the mask for the skcipher * * Return: true when the skcipher is known to the kernel crypto API; false * otherwise */ int crypto_has_skcipher(const char *alg_name, u32 type, u32 mask); static inline const char *crypto_skcipher_driver_name( struct crypto_skcipher *tfm) { return crypto_tfm_alg_driver_name(crypto_skcipher_tfm(tfm)); } static inline struct skcipher_alg *crypto_skcipher_alg( struct crypto_skcipher *tfm) { return container_of(crypto_skcipher_tfm(tfm)->__crt_alg, struct skcipher_alg, base); } static inline unsigned int crypto_skcipher_alg_ivsize(struct skcipher_alg *alg) { return alg->ivsize; } /** * crypto_skcipher_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the skcipher referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_skcipher_ivsize(struct crypto_skcipher *tfm) { return crypto_skcipher_alg(tfm)->ivsize; } static inline unsigned int crypto_sync_skcipher_ivsize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_ivsize(&tfm->base); } /** * crypto_skcipher_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the skcipher referenced with the cipher handle is * returned. The caller may use that information to allocate appropriate * memory for the data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_skcipher_blocksize( struct crypto_skcipher *tfm) { return crypto_tfm_alg_blocksize(crypto_skcipher_tfm(tfm)); } static inline unsigned int crypto_skcipher_alg_chunksize( struct skcipher_alg *alg) { return alg->chunksize; } /** * crypto_skcipher_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CTR. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_skcipher_chunksize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg_chunksize(crypto_skcipher_alg(tfm)); } static inline unsigned int crypto_sync_skcipher_blocksize( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_blocksize(&tfm->base); } static inline unsigned int crypto_skcipher_alignmask( struct crypto_skcipher *tfm) { return crypto_tfm_alg_alignmask(crypto_skcipher_tfm(tfm)); } static inline u32 crypto_skcipher_get_flags(struct crypto_skcipher *tfm) { return crypto_tfm_get_flags(crypto_skcipher_tfm(tfm)); } static inline void crypto_skcipher_set_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_set_flags(crypto_skcipher_tfm(tfm), flags); } static inline void crypto_skcipher_clear_flags(struct crypto_skcipher *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_skcipher_tfm(tfm), flags); } static inline u32 crypto_sync_skcipher_get_flags( struct crypto_sync_skcipher *tfm) { return crypto_skcipher_get_flags(&tfm->base); } static inline void crypto_sync_skcipher_set_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_set_flags(&tfm->base, flags); } static inline void crypto_sync_skcipher_clear_flags( struct crypto_sync_skcipher *tfm, u32 flags) { crypto_skcipher_clear_flags(&tfm->base, flags); } /** * crypto_skcipher_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the skcipher referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_skcipher_setkey(struct crypto_skcipher *tfm, const u8 *key, unsigned int keylen); static inline int crypto_sync_skcipher_setkey(struct crypto_sync_skcipher *tfm, const u8 *key, unsigned int keylen) { return crypto_skcipher_setkey(&tfm->base, key, keylen); } static inline unsigned int crypto_skcipher_min_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg(tfm)->min_keysize; } static inline unsigned int crypto_skcipher_max_keysize( struct crypto_skcipher *tfm) { return crypto_skcipher_alg(tfm)->max_keysize; } /** * crypto_skcipher_reqtfm() - obtain cipher handle from request * @req: skcipher_request out of which the cipher handle is to be obtained * * Return the crypto_skcipher handle when furnishing an skcipher_request * data structure. * * Return: crypto_skcipher handle */ static inline struct crypto_skcipher *crypto_skcipher_reqtfm( struct skcipher_request *req) { return __crypto_skcipher_cast(req->base.tfm); } static inline struct crypto_sync_skcipher *crypto_sync_skcipher_reqtfm( struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); return container_of(tfm, struct crypto_sync_skcipher, base); } /** * crypto_skcipher_encrypt() - encrypt plaintext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_encrypt(struct skcipher_request *req); /** * crypto_skcipher_decrypt() - decrypt ciphertext * @req: reference to the skcipher_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the skcipher_request handle. That data * structure and how it is filled with data is discussed with the * skcipher_request_* functions. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_skcipher_decrypt(struct skcipher_request *req); /** * DOC: Symmetric Key Cipher Request Handle * * The skcipher_request data structure contains all pointers to data * required for the symmetric key cipher operation. This includes the cipher * handle (which can be used by multiple skcipher_request instances), pointer * to plaintext and ciphertext, asynchronous callback function, etc. It acts * as a handle to the skcipher_request_* API calls in a similar way as * skcipher handle to the crypto_skcipher_* API calls. */ /** * crypto_skcipher_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_skcipher_reqsize(struct crypto_skcipher *tfm) { return tfm->reqsize; } /** * skcipher_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing skcipher handle in the request * data structure with a different one. */ static inline void skcipher_request_set_tfm(struct skcipher_request *req, struct crypto_skcipher *tfm) { req->base.tfm = crypto_skcipher_tfm(tfm); } static inline void skcipher_request_set_sync_tfm(struct skcipher_request *req, struct crypto_sync_skcipher *tfm) { skcipher_request_set_tfm(req, &tfm->base); } static inline struct skcipher_request *skcipher_request_cast( struct crypto_async_request *req) { return container_of(req, struct skcipher_request, base); } /** * skcipher_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the skcipher * encrypt and decrypt API calls. During the allocation, the provided skcipher * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct skcipher_request *skcipher_request_alloc( struct crypto_skcipher *tfm, gfp_t gfp) { struct skcipher_request *req; req = kmalloc(sizeof(struct skcipher_request) + crypto_skcipher_reqsize(tfm), gfp); if (likely(req)) skcipher_request_set_tfm(req, tfm); return req; } /** * skcipher_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void skcipher_request_free(struct skcipher_request *req) { kfree_sensitive(req); } static inline void skcipher_request_zero(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); memzero_explicit(req, sizeof(*req) + crypto_skcipher_reqsize(tfm)); } /** * skcipher_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * This function allows setting the callback function that is triggered once the * cipher operation completes. * * The callback function is registered with the skcipher_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void skcipher_request_set_callback(struct skcipher_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * skcipher_request_set_crypt() - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_skcipher_ivsize * * This function allows setting of the source data and destination data * scatter / gather lists. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. */ static inline void skcipher_request_set_crypt( struct skcipher_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, void *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } #endif /* _CRYPTO_SKCIPHER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_MLD_H #define LINUX_MLD_H #include <linux/in6.h> #include <linux/icmpv6.h> /* MLDv1 Query/Report/Done */ struct mld_msg { struct icmp6hdr mld_hdr; struct in6_addr mld_mca; }; #define mld_type mld_hdr.icmp6_type #define mld_code mld_hdr.icmp6_code #define mld_cksum mld_hdr.icmp6_cksum #define mld_maxdelay mld_hdr.icmp6_maxdelay #define mld_reserved mld_hdr.icmp6_dataun.un_data16[1] /* Multicast Listener Discovery version 2 headers */ /* MLDv2 Report */ struct mld2_grec { __u8 grec_type; __u8 grec_auxwords; __be16 grec_nsrcs; struct in6_addr grec_mca; struct in6_addr grec_src[]; }; struct mld2_report { struct icmp6hdr mld2r_hdr; struct mld2_grec mld2r_grec[]; }; #define mld2r_type mld2r_hdr.icmp6_type #define mld2r_resv1 mld2r_hdr.icmp6_code #define mld2r_cksum mld2r_hdr.icmp6_cksum #define mld2r_resv2 mld2r_hdr.icmp6_dataun.un_data16[0] #define mld2r_ngrec mld2r_hdr.icmp6_dataun.un_data16[1] /* MLDv2 Query */ struct mld2_query { struct icmp6hdr mld2q_hdr; struct in6_addr mld2q_mca; #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 mld2q_qrv:3, mld2q_suppress:1, mld2q_resv2:4; #elif defined(__BIG_ENDIAN_BITFIELD) __u8 mld2q_resv2:4, mld2q_suppress:1, mld2q_qrv:3; #else #error "Please fix <asm/byteorder.h>" #endif __u8 mld2q_qqic; __be16 mld2q_nsrcs; struct in6_addr mld2q_srcs[]; }; #define mld2q_type mld2q_hdr.icmp6_type #define mld2q_code mld2q_hdr.icmp6_code #define mld2q_cksum mld2q_hdr.icmp6_cksum #define mld2q_mrc mld2q_hdr.icmp6_maxdelay #define mld2q_resv1 mld2q_hdr.icmp6_dataun.un_data16[1] /* RFC3810, 5.1.3. Maximum Response Code: * * If Maximum Response Code >= 32768, Maximum Response Code represents a * floating-point value as follows: * * 0 1 2 3 4 5 6 7 8 9 A B C D E F * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ */ #define MLDV2_MRC_EXP(value) (((value) >> 12) & 0x0007) #define MLDV2_MRC_MAN(value) ((value) & 0x0fff) /* RFC3810, 5.1.9. QQIC (Querier's Query Interval Code): * * If QQIC >= 128, QQIC represents a floating-point value as follows: * * 0 1 2 3 4 5 6 7 * +-+-+-+-+-+-+-+-+ * |1| exp | mant | * +-+-+-+-+-+-+-+-+ */ #define MLDV2_QQIC_EXP(value) (((value) >> 4) & 0x07) #define MLDV2_QQIC_MAN(value) ((value) & 0x0f) #define MLD_EXP_MIN_LIMIT 32768UL #define MLDV1_MRD_MAX_COMPAT (MLD_EXP_MIN_LIMIT - 1) static inline unsigned long mldv2_mrc(const struct mld2_query *mlh2) { /* RFC3810, 5.1.3. Maximum Response Code */ unsigned long ret, mc_mrc = ntohs(mlh2->mld2q_mrc); if (mc_mrc < MLD_EXP_MIN_LIMIT) { ret = mc_mrc; } else { unsigned long mc_man, mc_exp; mc_exp = MLDV2_MRC_EXP(mc_mrc); mc_man = MLDV2_MRC_MAN(mc_mrc); ret = (mc_man | 0x1000) << (mc_exp + 3); } return ret; } #endif
1 1 1 2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 4476 4477 4478 4479 4480 4481 4482 4483 4484 4485 4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496 4497 4498 4499 4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 4530 4531 4532 4533 4534 4535 4536 4537 4538 4539 4540 4541 4542 4543 4544 4545 4546 4547 4548 4549 4550 4551 4552 4553 4554 4555 4556 4557 4558 4559 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607 4608 4609 4610 4611 4612 4613 4614 4615 4616 4617 4618 4619 4620 4621 4622 4623 4624 4625 4626 4627 4628 4629 4630 4631 4632 4633 4634 4635 4636 4637 4638 4639 4640 4641 4642 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 4658 4659 4660 4661 4662 4663 4664 4665 4666 4667 4668 4669 4670 4671 4672 4673 4674 4675 4676 4677 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 4693 4694 4695 4696 4697 4698 4699 4700 4701 4702 4703 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 4729 4730 4731 4732 4733 4734 4735 4736 4737 4738 4739 4740 4741 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 4776 4777 4778 4779 4780 4781 4782 4783 4784 4785 4786 4787 4788 4789 4790 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 4820 4821 4822 4823 4824 4825 4826 4827 4828 4829 4830 4831 4832 4833 4834 4835 4836 4837 4838 4839 4840 4841 4842 4843 4844 4845 4846 4847 4848 4849 4850 4851 4852 4853 4854 4855 4856 4857 4858 4859 4860 4861 4862 4863 4864 4865 4866 4867 4868 4869 4870 4871 4872 4873 4874 4875 4876 4877 4878 4879 4880 4881 4882 4883 4884 4885 4886 4887 4888 4889 4890 4891 4892 4893 4894 4895 4896 4897 4898 4899 4900 4901 4902 4903 4904 4905 4906 4907 4908 4909 4910 4911 4912 4913 4914 4915 4916 4917 4918 4919 4920 4921 4922 4923 4924 4925 4926 4927 4928 4929 4930 4931 4932 4933 4934 4935 4936 4937 4938 4939 4940 4941 4942 4943 4944 4945 4946 4947 4948 4949 4950 4951 4952 4953 4954 4955 4956 4957 4958 4959 4960 4961 4962 4963 4964 4965 4966 4967 4968 4969 4970 4971 4972 4973 4974 4975 4976 4977 4978 4979 4980 4981 4982 4983 4984 4985 4986 4987 4988 4989 4990 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 5005 5006 5007 5008 5009 5010 5011 5012 5013 5014 5015 5016 5017 5018 5019 5020 5021 5022 5023 5024 5025 5026 5027 5028 5029 5030 5031 5032 5033 5034 5035 5036 5037 5038 5039 5040 5041 5042 5043 5044 5045 5046 5047 5048 5049 5050 5051 5052 5053 5054 5055 5056 5057 5058 5059 5060 5061 5062 5063 5064 5065 5066 5067 5068 5069 5070 5071 5072 5073 5074 5075 5076 5077 5078 5079 5080 5081 5082 5083 5084 5085 5086 5087 5088 5089 5090 5091 5092 5093 5094 5095 5096 5097 5098 5099 5100 5101 5102 5103 5104 5105 5106 5107 5108 5109 5110 5111 5112 5113 5114 5115 5116 5117 5118 5119 5120 5121 5122 5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 5147 5148 5149 5150 5151 5152 5153 5154 5155 5156 5157 5158 5159 5160 5161 5162 5163 5164 5165 5166 5167 5168 5169 5170 5171 5172 5173 5174 5175 5176 5177 5178 5179 5180 5181 5182 5183 5184 5185 5186 5187 5188 5189 5190 5191 5192 5193 5194 5195 5196 5197 5198 5199 5200 5201 5202 5203 5204 5205 5206 5207 5208 5209 5210 5211 5212 5213 5214 5215 5216 5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230 5231 5232 5233 5234 5235 5236 5237 5238 5239 5240 5241 5242 5243 5244 5245 5246 5247 5248 5249 5250 5251 5252 5253 5254 5255 5256 5257 5258 5259 5260 5261 5262 5263 5264 5265 5266 5267 5268 5269 5270 5271 5272 5273 5274 5275 5276 5277 5278 5279 5280 5281 5282 5283 5284 5285 5286 5287 5288 5289 5290 5291 5292 5293 5294 5295 5296 5297 5298 5299 5300 5301 5302 5303 5304 5305 5306 5307 // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c * * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds */ /* * demand-loading started 01.12.91 - seems it is high on the list of * things wanted, and it should be easy to implement. - Linus */ /* * Ok, demand-loading was easy, shared pages a little bit tricker. Shared * pages started 02.12.91, seems to work. - Linus. * * Tested sharing by executing about 30 /bin/sh: under the old kernel it * would have taken more than the 6M I have free, but it worked well as * far as I could see. * * Also corrected some "invalidate()"s - I wasn't doing enough of them. */ /* * Real VM (paging to/from disk) started 18.12.91. Much more work and * thought has to go into this. Oh, well.. * 19.12.91 - works, somewhat. Sometimes I get faults, don't know why. * Found it. Everything seems to work now. * 20.12.91 - Ok, making the swap-device changeable like the root. */ /* * 05.04.94 - Multi-page memory management added for v1.1. * Idea by Alex Bligh (alex@cconcepts.co.uk) * * 16.07.99 - Support of BIGMEM added by Gerhard Wichert, Siemens AG * (Gerhard.Wichert@pdb.siemens.de) * * Aug/Sep 2004 Changed to four level page tables (Andi Kleen) */ #include <linux/kernel_stat.h> #include <linux/mm.h> #include <linux/sched/mm.h> #include <linux/sched/coredump.h> #include <linux/sched/numa_balancing.h> #include <linux/sched/task.h> #include <linux/hugetlb.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/memremap.h> #include <linux/ksm.h> #include <linux/rmap.h> #include <linux/export.h> #include <linux/delayacct.h> #include <linux/init.h> #include <linux/pfn_t.h> #include <linux/writeback.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/swapops.h> #include <linux/elf.h> #include <linux/gfp.h> #include <linux/migrate.h> #include <linux/string.h> #include <linux/debugfs.h> #include <linux/userfaultfd_k.h> #include <linux/dax.h> #include <linux/oom.h> #include <linux/numa.h> #include <linux/perf_event.h> #include <linux/ptrace.h> #include <linux/vmalloc.h> #include <trace/events/kmem.h> #include <asm/io.h> #include <asm/mmu_context.h> #include <asm/pgalloc.h> #include <linux/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> #include "pgalloc-track.h" #include "internal.h" #if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST) #warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; EXPORT_SYMBOL(max_mapnr); struct page *mem_map; EXPORT_SYMBOL(mem_map); #endif /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end * of ZONE_NORMAL. Under CONFIG_DISCONTIG this means that max_low_pfn and * highstart_pfn must be the same; there must be no gap between ZONE_NORMAL * and ZONE_HIGHMEM. */ void *high_memory; EXPORT_SYMBOL(high_memory); /* * Randomize the address space (stacks, mmaps, brk, etc.). * * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization, * as ancient (libc5 based) binaries can segfault. ) */ int randomize_va_space __read_mostly = #ifdef CONFIG_COMPAT_BRK 1; #else 2; #endif #ifndef arch_faults_on_old_pte static inline bool arch_faults_on_old_pte(void) { /* * Those arches which don't have hw access flag feature need to * implement their own helper. By default, "true" means pagefault * will be hit on old pte. */ return true; } #endif static int __init disable_randmaps(char *s) { randomize_va_space = 0; return 1; } __setup("norandmaps", disable_randmaps); unsigned long zero_pfn __read_mostly; EXPORT_SYMBOL(zero_pfn); unsigned long highest_memmap_pfn __read_mostly; /* * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() */ static int __init init_zero_pfn(void) { zero_pfn = page_to_pfn(ZERO_PAGE(0)); return 0; } early_initcall(init_zero_pfn); void mm_trace_rss_stat(struct mm_struct *mm, int member, long count) { trace_rss_stat(mm, member, count); } #if defined(SPLIT_RSS_COUNTING) void sync_mm_rss(struct mm_struct *mm) { int i; for (i = 0; i < NR_MM_COUNTERS; i++) { if (current->rss_stat.count[i]) { add_mm_counter(mm, i, current->rss_stat.count[i]); current->rss_stat.count[i] = 0; } } current->rss_stat.events = 0; } static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) { struct task_struct *task = current; if (likely(task->mm == mm)) task->rss_stat.count[member] += val; else add_mm_counter(mm, member, val); } #define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) #define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) /* sync counter once per 64 page faults */ #define TASK_RSS_EVENTS_THRESH (64) static void check_sync_rss_stat(struct task_struct *task) { if (unlikely(task != current)) return; if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) sync_mm_rss(task->mm); } #else /* SPLIT_RSS_COUNTING */ #define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) #define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) static void check_sync_rss_stat(struct task_struct *task) { } #endif /* SPLIT_RSS_COUNTING */ /* * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); mm_dec_nr_ptes(tlb->mm); } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; unsigned long start; start = addr; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; if (start < floor) return; if (ceiling) { ceiling &= PUD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pmd = pmd_offset(pud, start); pud_clear(pud); pmd_free_tlb(tlb, pmd, start); mm_dec_nr_pmds(tlb->mm); } static inline void free_pud_range(struct mmu_gather *tlb, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; unsigned long start; start = addr; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); start &= P4D_MASK; if (start < floor) return; if (ceiling) { ceiling &= P4D_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; pud = pud_offset(p4d, start); p4d_clear(p4d); pud_free_tlb(tlb, pud, start); mm_dec_nr_puds(tlb->mm); } static inline void free_p4d_range(struct mmu_gather *tlb, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { p4d_t *p4d; unsigned long next; unsigned long start; start = addr; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; free_pud_range(tlb, p4d, addr, next, floor, ceiling); } while (p4d++, addr = next, addr != end); start &= PGDIR_MASK; if (start < floor) return; if (ceiling) { ceiling &= PGDIR_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) return; p4d = p4d_offset(pgd, start); pgd_clear(pgd); p4d_free_tlb(tlb, p4d, start); } /* * This function frees user-level page tables of a process. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; /* * The next few lines have given us lots of grief... * * Why are we testing PMD* at this top level? Because often * there will be no work to do at all, and we'd prefer not to * go all the way down to the bottom just to discover that. * * Why all these "- 1"s? Because 0 represents both the bottom * of the address space and the top of it (using -1 for the * top wouldn't help much: the masks would do the wrong thing). * The rule is that addr 0 and floor 0 refer to the bottom of * the address space, but end 0 and ceiling 0 refer to the top * Comparisons need to use "end - 1" and "ceiling - 1" (though * that end 0 case should be mythical). * * Wherever addr is brought up or ceiling brought down, we must * be careful to reject "the opposite 0" before it confuses the * subsequent tests. But what about where end is brought down * by PMD_SIZE below? no, end can't go down to 0 there. * * Whereas we round start (addr) and ceiling down, by different * masks at different levels, in order to test whether a table * now has no other vmas using it, so can be freed, we don't * bother to round floor or end up - the tests don't need that. */ addr &= PMD_MASK; if (addr < floor) { addr += PMD_SIZE; if (!addr) return; } if (ceiling) { ceiling &= PMD_MASK; if (!ceiling) return; } if (end - 1 > ceiling - 1) end -= PMD_SIZE; if (addr > end - 1) return; /* * We add page table cache pages with PAGE_SIZE, * (see pte_free_tlb()), flush the tlb if we need */ tlb_change_page_size(tlb, PAGE_SIZE); pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; free_p4d_range(tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); } void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long floor, unsigned long ceiling) { while (vma) { struct vm_area_struct *next = vma->vm_next; unsigned long addr = vma->vm_start; /* * Hide vma from rmap and truncate_pagecache before freeing * pgtables */ unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { hugetlb_free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } else { /* * Optimization: gather nearby vmas into one call down */ while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, floor, next ? next->vm_start : ceiling); } vma = next; } } int __pte_alloc(struct mm_struct *mm, pmd_t *pmd) { spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm); if (!new) return -ENOMEM; /* * Ensure all pte setup (eg. pte page lock and page clearing) are * visible before the pte is made visible to other CPUs by being * put into page tables. * * The other side of the story is the pointer chasing in the page * table walking code (when walking the page table without locking; * ie. most of the time). Fortunately, these data accesses consist * of a chain of data-dependent loads, meaning most CPUs (alpha * being the notable exception) will already guarantee loads are * seen in-order. See the alpha page table accessors for the * smp_rmb() barriers in page table walking code. */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ ptl = pmd_lock(mm, pmd); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm_inc_nr_ptes(mm); pmd_populate(mm, pmd, new); new = NULL; } spin_unlock(ptl); if (new) pte_free(mm, new); return 0; } int __pte_alloc_kernel(pmd_t *pmd) { pte_t *new = pte_alloc_one_kernel(&init_mm); if (!new) return -ENOMEM; smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; } spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); return 0; } static inline void init_rss_vec(int *rss) { memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); } static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { int i; if (current->mm == mm) sync_mm_rss(mm); for (i = 0; i < NR_MM_COUNTERS; i++) if (rss[i]) add_mm_counter(mm, i, rss[i]); } /* * This function is called to print an error when a bad pte * is found. For example, we might have a PFN-mapped pte in * a region that doesn't allow it. * * The calling function must still handle the error. */ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, pte_t pte, struct page *page) { pgd_t *pgd = pgd_offset(vma->vm_mm, addr); p4d_t *p4d = p4d_offset(pgd, addr); pud_t *pud = pud_offset(p4d, addr); pmd_t *pmd = pmd_offset(pud, addr); struct address_space *mapping; pgoff_t index; static unsigned long resume; static unsigned long nr_shown; static unsigned long nr_unshown; /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. */ if (nr_shown == 60) { if (time_before(jiffies, resume)) { nr_unshown++; return; } if (nr_unshown) { pr_alert("BUG: Bad page map: %lu messages suppressed\n", nr_unshown); nr_unshown = 0; } nr_shown = 0; } if (nr_shown++ == 0) resume = jiffies + 60 * HZ; mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL; index = linear_page_index(vma, addr); pr_alert("BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); if (page) dump_page(page, "bad pte"); pr_alert("addr:%px vm_flags:%08lx anon_vma:%px mapping:%px index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); pr_alert("file:%pD fault:%ps mmap:%ps readpage:%ps\n", vma->vm_file, vma->vm_ops ? vma->vm_ops->fault : NULL, vma->vm_file ? vma->vm_file->f_op->mmap : NULL, mapping ? mapping->a_ops->readpage : NULL); dump_stack(); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * * "Special" mappings do not wish to be associated with a "struct page" (either * it doesn't exist, or it exists but they don't want to touch it). In this * case, NULL is returned here. "Normal" mappings do have a struct page. * * There are 2 broad cases. Firstly, an architecture may define a pte_special() * pte bit, in which case this function is trivial. Secondly, an architecture * may not have a spare pte bit, which requires a more complicated scheme, * described below. * * A raw VM_PFNMAP mapping (ie. one that is not COWed) is always considered a * special mapping (even if there are underlying and valid "struct pages"). * COWed pages of a VM_PFNMAP are always normal. * * The way we recognize COWed pages within VM_PFNMAP mappings is through the * rules set up by "remap_pfn_range()": the vma will have the VM_PFNMAP bit * set, and the vm_pgoff will point to the first PFN mapped: thus every special * mapping will always honor the rule * * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) * * And for normal mappings this is false. * * This restricts such mappings to be a linear translation from virtual address * to pfn. To get around this restriction, we allow arbitrary mappings so long * as the vma is not a COW mapping; in that case, we know that all ptes are * special (because none can have been COWed). * * * In order to support COW of arbitrary special mappings, we have VM_MIXEDMAP. * * VM_MIXEDMAP mappings can likewise contain memory with or without "struct * page" backing, however the difference is that _all_ pages with a struct * page (that is, those where pfn_valid is true) are refcounted and considered * normal pages by the VM. The disadvantage is that pages are refcounted * (which can be slower and simply not an option for some PFNMAP users). The * advantage is that we don't have to follow the strict linearity rule of * PFNMAP mappings in order to support COWable mappings. * */ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { unsigned long pfn = pte_pfn(pte); if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) return vma->vm_ops->find_special_page(vma, addr); if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; if (is_zero_pfn(pfn)) return NULL; if (pte_devmap(pte)) return NULL; print_bad_pte(vma, addr, pte, NULL); return NULL; } /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (is_zero_pfn(pfn)) return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd) { unsigned long pfn = pmd_pfn(pmd); /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { if (!pfn_valid(pfn)) return NULL; goto out; } else { unsigned long off; off = (addr - vma->vm_start) >> PAGE_SHIFT; if (pfn == vma->vm_pgoff + off) return NULL; if (!is_cow_mapping(vma->vm_flags)) return NULL; } } if (pmd_devmap(pmd)) return NULL; if (is_huge_zero_pmd(pmd)) return NULL; if (unlikely(pfn > highest_memmap_pfn)) return NULL; /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. */ out: return pfn_to_page(pfn); } #endif /* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. */ static unsigned long copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, unsigned long addr, int *rss) { unsigned long vm_flags = dst_vma->vm_flags; pte_t pte = *src_pte; struct page *page; swp_entry_t entry = pte_to_swp_entry(pte); if (likely(!non_swap_entry(entry))) { if (swap_duplicate(entry) < 0) return entry.val; /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); if (list_empty(&dst_mm->mmlist)) list_add(&dst_mm->mmlist, &src_mm->mmlist); spin_unlock(&mmlist_lock); } rss[MM_SWAPENTS]++; } else if (is_migration_entry(entry)) { page = migration_entry_to_page(entry); rss[mm_counter(page)]++; if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both * parent and child to be set to read. */ make_migration_entry_read(&entry); pte = swp_entry_to_pte(entry); if (pte_swp_soft_dirty(*src_pte)) pte = pte_swp_mksoft_dirty(pte); if (pte_swp_uffd_wp(*src_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } else if (is_device_private_entry(entry)) { page = device_private_entry_to_page(entry); /* * Update rss count even for unaddressable pages, as * they should treated just like normal pages in this * respect. * * We will likely want to have some new rss counters * for unaddressable pages, at some point. But for now * keep things as they are. */ get_page(page); rss[mm_counter(page)]++; page_dup_rmap(page, false); /* * We do not preserve soft-dirty information, because so * far, checkpoint/restore is the only feature that * requires that. And checkpoint/restore does not work * when a device driver is involved (you cannot easily * save and restore device driver state). */ if (is_write_device_private_entry(entry) && is_cow_mapping(vm_flags)) { make_device_private_entry_read(&entry); pte = swp_entry_to_pte(entry); if (pte_swp_uffd_wp(*src_pte)) pte = pte_swp_mkuffd_wp(pte); set_pte_at(src_mm, addr, src_pte, pte); } } if (!userfaultfd_wp(dst_vma)) pte = pte_swp_clear_uffd_wp(pte); set_pte_at(dst_mm, addr, dst_pte, pte); return 0; } /* * Copy a present and normal page if necessary. * * NOTE! The usual case is that this doesn't need to do * anything, and can just return a positive value. That * will let the caller know that it can just increase * the page refcount and re-use the pte the traditional * way. * * But _if_ we need to copy it because it needs to be * pinned in the parent (and the child should get its own * copy rather than just a reference to the same page), * we'll do that here and return zero to let the caller * know we're done. * * And if we need a pre-allocated page but don't yet have * one, return a negative error to let the preallocation * code know so that it can do so outside the page table * lock. */ static inline int copy_present_page(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct page **prealloc, pte_t pte, struct page *page) { struct mm_struct *src_mm = src_vma->vm_mm; struct page *new_page; if (!is_cow_mapping(src_vma->vm_flags)) return 1; /* * What we want to do is to check whether this page may * have been pinned by the parent process. If so, * instead of wrprotect the pte on both sides, we copy * the page immediately so that we'll always guarantee * the pinned page won't be randomly replaced in the * future. * * The page pinning checks are just "has this mm ever * seen pinning", along with the (inexact) check of * the page count. That might give false positives for * for pinning, but it will work correctly. */ if (likely(!atomic_read(&src_mm->has_pinned))) return 1; if (likely(!page_maybe_dma_pinned(page))) return 1; new_page = *prealloc; if (!new_page) return -EAGAIN; /* * We have a prealloc page, all good! Take it * over and copy the page & arm it. */ *prealloc = NULL; copy_user_highpage(new_page, page, addr, src_vma); __SetPageUptodate(new_page); page_add_new_anon_rmap(new_page, dst_vma, addr, false); lru_cache_add_inactive_or_unevictable(new_page, dst_vma); rss[mm_counter(new_page)]++; /* All done, just insert the new page copy in the child */ pte = mk_pte(new_page, dst_vma->vm_page_prot); pte = maybe_mkwrite(pte_mkdirty(pte), dst_vma); if (userfaultfd_pte_wp(dst_vma, *src_pte)) /* Uffd-wp needs to be delivered to dest pte as well */ pte = pte_wrprotect(pte_mkuffd_wp(pte)); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } /* * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page * is required to copy this pte. */ static inline int copy_present_pte(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pte_t *dst_pte, pte_t *src_pte, unsigned long addr, int *rss, struct page **prealloc) { struct mm_struct *src_mm = src_vma->vm_mm; unsigned long vm_flags = src_vma->vm_flags; pte_t pte = *src_pte; struct page *page; page = vm_normal_page(src_vma, addr, pte); if (page) { int retval; retval = copy_present_page(dst_vma, src_vma, dst_pte, src_pte, addr, rss, prealloc, pte, page); if (retval <= 0) return retval; get_page(page); page_dup_rmap(page, false); rss[mm_counter(page)]++; } /* * If it's a COW mapping, write protect it both * in the parent and the child */ if (is_cow_mapping(vm_flags) && pte_write(pte)) { ptep_set_wrprotect(src_mm, addr, src_pte); pte = pte_wrprotect(pte); } /* * If it's a shared mapping, mark it clean in * the child */ if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); if (!userfaultfd_wp(dst_vma)) pte = pte_clear_uffd_wp(pte); set_pte_at(dst_vma->vm_mm, addr, dst_pte, pte); return 0; } static inline struct page * page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma, unsigned long addr) { struct page *new_page; new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr); if (!new_page) return NULL; if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) { put_page(new_page); return NULL; } cgroup_throttle_swaprate(new_page, GFP_KERNEL); return new_page; } static int copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress, ret = 0; int rss[NR_MM_COUNTERS]; swp_entry_t entry = (swp_entry_t){0}; struct page *prealloc = NULL; again: progress = 0; init_rss_vec(rss); dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) { ret = -ENOMEM; goto out; } src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); orig_src_pte = src_pte; orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { /* * We are holding two locks at this point - either of them * could generate latencies in another task on another CPU. */ if (progress >= 32) { progress = 0; if (need_resched() || spin_needbreak(src_ptl) || spin_needbreak(dst_ptl)) break; } if (pte_none(*src_pte)) { progress++; continue; } if (unlikely(!pte_present(*src_pte))) { entry.val = copy_nonpresent_pte(dst_mm, src_mm, dst_pte, src_pte, dst_vma, src_vma, addr, rss); if (entry.val) break; progress += 8; continue; } /* copy_present_pte() will clear `*prealloc' if consumed */ ret = copy_present_pte(dst_vma, src_vma, dst_pte, src_pte, addr, rss, &prealloc); /* * If we need a pre-allocated page for this pte, drop the * locks, allocate, and try again. */ if (unlikely(ret == -EAGAIN)) break; if (unlikely(prealloc)) { /* * pre-alloc page cannot be reused by next time so as * to strictly follow mempolicy (e.g., alloc_page_vma() * will allocate page according to address). This * could only happen if one pinned pte changed. */ put_page(prealloc); prealloc = NULL; } progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); pte_unmap(orig_src_pte); add_mm_rss_vec(dst_mm, rss); pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); if (entry.val) { if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) { ret = -ENOMEM; goto out; } entry.val = 0; } else if (ret) { WARN_ON_ONCE(ret != -EAGAIN); prealloc = page_copy_prealloc(src_mm, src_vma, addr); if (!prealloc) return -ENOMEM; /* We've captured and resolved the error. Reset, try again. */ ret = 0; } if (addr != end) goto again; out: if (unlikely(prealloc)) put_page(prealloc); return ret; } static inline int copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pmd_t *src_pmd, *dst_pmd; unsigned long next; dst_pmd = pmd_alloc(dst_mm, dst_pud, addr); if (!dst_pmd) return -ENOMEM; src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*src_pmd) || pmd_trans_huge(*src_pmd) || pmd_devmap(*src_pmd)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PMD_SIZE, src_vma); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, dst_vma, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd, addr, next)) return -ENOMEM; } while (dst_pmd++, src_pmd++, addr = next, addr != end); return 0; } static inline int copy_pud_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, p4d_t *dst_p4d, p4d_t *src_p4d, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; pud_t *src_pud, *dst_pud; unsigned long next; dst_pud = pud_alloc(dst_mm, dst_p4d, addr); if (!dst_pud) return -ENOMEM; src_pud = pud_offset(src_p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*src_pud) || pud_devmap(*src_pud)) { int err; VM_BUG_ON_VMA(next-addr != HPAGE_PUD_SIZE, src_vma); err = copy_huge_pud(dst_mm, src_mm, dst_pud, src_pud, addr, src_vma); if (err == -ENOMEM) return -ENOMEM; if (!err) continue; /* fall through */ } if (pud_none_or_clear_bad(src_pud)) continue; if (copy_pmd_range(dst_vma, src_vma, dst_pud, src_pud, addr, next)) return -ENOMEM; } while (dst_pud++, src_pud++, addr = next, addr != end); return 0; } static inline int copy_p4d_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, pgd_t *dst_pgd, pgd_t *src_pgd, unsigned long addr, unsigned long end) { struct mm_struct *dst_mm = dst_vma->vm_mm; p4d_t *src_p4d, *dst_p4d; unsigned long next; dst_p4d = p4d_alloc(dst_mm, dst_pgd, addr); if (!dst_p4d) return -ENOMEM; src_p4d = p4d_offset(src_pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(src_p4d)) continue; if (copy_pud_range(dst_vma, src_vma, dst_p4d, src_p4d, addr, next)) return -ENOMEM; } while (dst_p4d++, src_p4d++, addr = next, addr != end); return 0; } int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma) { pgd_t *src_pgd, *dst_pgd; unsigned long next; unsigned long addr = src_vma->vm_start; unsigned long end = src_vma->vm_end; struct mm_struct *dst_mm = dst_vma->vm_mm; struct mm_struct *src_mm = src_vma->vm_mm; struct mmu_notifier_range range; bool is_cow; int ret; /* * Don't copy ptes where a page fault will fill them correctly. * Fork becomes much lighter when there are big shared or private * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ if (!(src_vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) && !src_vma->anon_vma) return 0; if (is_vm_hugetlb_page(src_vma)) return copy_hugetlb_page_range(dst_mm, src_mm, src_vma); if (unlikely(src_vma->vm_flags & VM_PFNMAP)) { /* * We do not free on error cases below as remove_vma * gets called on error from higher level routine */ ret = track_pfn_copy(src_vma); if (ret) return ret; } /* * We need to invalidate the secondary MMU mappings only when * there could be a permission downgrade on the ptes of the * parent mm. And a permission downgrade will only happen if * is_cow_mapping() returns true. */ is_cow = is_cow_mapping(src_vma->vm_flags); if (is_cow) { mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, src_vma, src_mm, addr, end); mmu_notifier_invalidate_range_start(&range); /* * Disabling preemption is not needed for the write side, as * the read side doesn't spin, but goes to the mmap_lock. * * Use the raw variant of the seqcount_t write API to avoid * lockdep complaining about preemptibility. */ mmap_assert_write_locked(src_mm); raw_write_seqcount_begin(&src_mm->write_protect_seq); } ret = 0; dst_pgd = pgd_offset(dst_mm, addr); src_pgd = pgd_offset(src_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(src_pgd)) continue; if (unlikely(copy_p4d_range(dst_vma, src_vma, dst_pgd, src_pgd, addr, next))) { ret = -ENOMEM; break; } } while (dst_pgd++, src_pgd++, addr = next, addr != end); if (is_cow) { raw_write_seqcount_end(&src_mm->write_protect_seq); mmu_notifier_invalidate_range_end(&range); } return ret; } static unsigned long zap_pte_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, struct zap_details *details) { struct mm_struct *mm = tlb->mm; int force_flush = 0; int rss[NR_MM_COUNTERS]; spinlock_t *ptl; pte_t *start_pte; pte_t *pte; swp_entry_t entry; tlb_change_page_size(tlb, PAGE_SIZE); again: init_rss_vec(rss); start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte = start_pte; flush_tlb_batched_pending(mm); arch_enter_lazy_mmu_mode(); do { pte_t ptent = *pte; if (pte_none(ptent)) continue; if (need_resched()) break; if (pte_present(ptent)) { struct page *page; page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to * invalidate cache without truncating: * unmap shared but keep private pages. */ if (details->check_mapping && details->check_mapping != page_rmapping(page)) continue; } ptent = ptep_get_and_clear_full(mm, addr, pte, tlb->fullmm); tlb_remove_tlb_entry(tlb, pte, addr); if (unlikely(!page)) continue; if (!PageAnon(page)) { if (pte_dirty(ptent)) { force_flush = 1; set_page_dirty(page); } if (pte_young(ptent) && likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); } rss[mm_counter(page)]--; page_remove_rmap(page, false); if (unlikely(page_mapcount(page) < 0)) print_bad_pte(vma, addr, ptent, page); if (unlikely(__tlb_remove_page(tlb, page))) { force_flush = 1; addr += PAGE_SIZE; break; } continue; } entry = pte_to_swp_entry(ptent); if (is_device_private_entry(entry)) { struct page *page = device_private_entry_to_page(entry); if (unlikely(details && details->check_mapping)) { /* * unmap_shared_mapping_pages() wants to * invalidate cache without truncating: * unmap shared but keep private pages. */ if (details->check_mapping != page_rmapping(page)) continue; } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); rss[mm_counter(page)]--; page_remove_rmap(page, false); put_page(page); continue; } /* If details->check_mapping, we leave swap entries. */ if (unlikely(details)) continue; if (!non_swap_entry(entry)) rss[MM_SWAPENTS]--; else if (is_migration_entry(entry)) { struct page *page; page = migration_entry_to_page(entry); rss[mm_counter(page)]--; } if (unlikely(!free_swap_and_cache(entry))) print_bad_pte(vma, addr, ptent, NULL); pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, addr != end); add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); /* Do the actual TLB flush before dropping ptl */ if (force_flush) tlb_flush_mmu_tlbonly(tlb); pte_unmap_unlock(start_pte, ptl); /* * If we forced a TLB flush (either due to running out of * batch buffers or because we needed to flush dirty TLB * entries before releasing the ptl), free the batched * memory too. Restart if we didn't do everything. */ if (force_flush) { force_flush = 0; tlb_flush_mmu(tlb); } if (addr != end) { cond_resched(); goto again; } return addr; } static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, struct zap_details *details) { pmd_t *pmd; unsigned long next; pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) __split_huge_pmd(vma, pmd, addr, false, NULL); else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ } else if (details && details->single_page && PageTransCompound(details->single_page) && next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { spinlock_t *ptl = pmd_lock(tlb->mm, pmd); /* * Take and drop THP pmd lock so that we cannot return * prematurely, while zap_huge_pmd() has cleared *pmd, * but not yet decremented compound_mapcount(). */ spin_unlock(ptl); } /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is * none or trans huge it can change under us. This is * because MADV_DONTNEED holds the mmap_lock in read * mode. */ if (pmd_none_or_trans_huge_or_clear_bad(pmd)) goto next; next = zap_pte_range(tlb, vma, pmd, addr, next, details); next: cond_resched(); } while (pmd++, addr = next, addr != end); return addr; } static inline unsigned long zap_pud_range(struct mmu_gather *tlb, struct vm_area_struct *vma, p4d_t *p4d, unsigned long addr, unsigned long end, struct zap_details *details) { pud_t *pud; unsigned long next; pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); if (pud_trans_huge(*pud) || pud_devmap(*pud)) { if (next - addr != HPAGE_PUD_SIZE) { mmap_assert_locked(tlb->mm); split_huge_pud(vma, pud, addr); } else if (zap_huge_pud(tlb, vma, pud, addr)) goto next; /* fall through */ } if (pud_none_or_clear_bad(pud)) continue; next = zap_pmd_range(tlb, vma, pud, addr, next, details); next: cond_resched(); } while (pud++, addr = next, addr != end); return addr; } static inline unsigned long zap_p4d_range(struct mmu_gather *tlb, struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, struct zap_details *details) { p4d_t *p4d; unsigned long next; p4d = p4d_offset(pgd, addr); do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) continue; next = zap_pud_range(tlb, vma, p4d, addr, next, details); } while (p4d++, addr = next, addr != end); return addr; } void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) { pgd_t *pgd; unsigned long next; BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; next = zap_p4d_range(tlb, vma, pgd, addr, next, details); } while (pgd++, addr = next, addr != end); tlb_end_vma(tlb, vma); } static void unmap_single_vma(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { unsigned long start = max(vma->vm_start, start_addr); unsigned long end; if (start >= vma->vm_end) return; end = min(vma->vm_end, end_addr); if (end <= vma->vm_start) return; if (vma->vm_file) uprobe_munmap(vma, start, end); if (unlikely(vma->vm_flags & VM_PFNMAP)) untrack_pfn(vma, 0, 0); if (start != end) { if (unlikely(is_vm_hugetlb_page(vma))) { /* * It is undesirable to test vma->vm_file as it * should be non-null for valid hugetlb area. * However, vm_file will be NULL in the error * cleanup path of mmap_region. When * hugetlbfs ->mmap method fails, * mmap_region() nullifies vma->vm_file * before calling this function to clean up. * Since no pte has actually been setup, it is * safe to do nothing in this case. */ if (vma->vm_file) { i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL); i_mmap_unlock_write(vma->vm_file->f_mapping); } } else unmap_page_range(tlb, vma, start, end, details); } } /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping * * Unmap all pages in the vma list. * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. * * unmap_vmas() assumes that the caller will flush the whole unmapped address * range after unmap_vmas() returns. So the only responsibility here is to * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { struct mmu_notifier_range range; mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) unmap_single_vma(tlb, vma, start_addr, end_addr, NULL); mmu_notifier_invalidate_range_end(&range); } /** * zap_page_range - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @start: starting address of pages to zap * @size: number of bytes to zap * * Caller must protect the VMA list */ void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { struct mmu_notifier_range range; struct mmu_gather tlb; lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, start, start + size); tlb_gather_mmu(&tlb, vma->vm_mm, start, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) unmap_single_vma(&tlb, vma, start, range.end, NULL); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, start, range.end); } /** * zap_page_range_single - remove user pages in a given range * @vma: vm_area_struct holding the applicable pages * @address: starting address of pages to zap * @size: number of bytes to zap * @details: details of shared cache invalidation * * The range must fit into one VMA. */ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, unsigned long size, struct zap_details *details) { struct mmu_notifier_range range; struct mmu_gather tlb; lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, address, address + size); tlb_gather_mmu(&tlb, vma->vm_mm, address, range.end); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); unmap_single_vma(&tlb, vma, address, range.end, details); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb, address, range.end); } /** * zap_vma_ptes - remove ptes mapping the vma * @vma: vm_area_struct holding ptes to be zapped * @address: starting address of pages to zap * @size: number of bytes to zap * * This function only unmaps ptes assigned to VM_PFNMAP vmas. * * The entire address range must be fully contained within the vma. * */ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size) { if (address < vma->vm_start || address + size > vma->vm_end || !(vma->vm_flags & VM_PFNMAP)) return; zap_page_range_single(vma, address, size, NULL); } EXPORT_SYMBOL_GPL(zap_vma_ptes); static pmd_t *walk_to_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd; pgd = pgd_offset(mm, addr); p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return NULL; pud = pud_alloc(mm, p4d, addr); if (!pud) return NULL; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return NULL; VM_BUG_ON(pmd_trans_huge(*pmd)); return pmd; } pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pmd_t *pmd = walk_to_pmd(mm, addr); if (!pmd) return NULL; return pte_alloc_map_lock(mm, pmd, addr, ptl); } static int validate_page_before_insert(struct page *page) { if (PageAnon(page) || PageSlab(page) || page_has_type(page)) return -EINVAL; flush_dcache_page(page); return 0; } static int insert_page_into_pte_locked(struct mm_struct *mm, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { if (!pte_none(*pte)) return -EBUSY; /* Ok, finally just insert the thing.. */ get_page(page); inc_mm_counter_fast(mm, mm_counter_file(page)); page_add_file_rmap(page, false); set_pte_at(mm, addr, pte, mk_pte(page, prot)); return 0; } /* * This is the old fallback for page remapping. * * For historical reasons, it only allows reserved pages. Only * old drivers should use this, and they needed to mark their * pages reserved for the old functions anyway. */ static int insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page, pgprot_t prot) { struct mm_struct *mm = vma->vm_mm; int retval; pte_t *pte; spinlock_t *ptl; retval = validate_page_before_insert(page); if (retval) goto out; retval = -ENOMEM; pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; retval = insert_page_into_pte_locked(mm, pte, addr, page, prot); pte_unmap_unlock(pte, ptl); out: return retval; } #ifdef pte_index static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte, unsigned long addr, struct page *page, pgprot_t prot) { int err; if (!page_count(page)) return -EINVAL; err = validate_page_before_insert(page); if (err) return err; return insert_page_into_pte_locked(mm, pte, addr, page, prot); } /* insert_pages() amortizes the cost of spinlock operations * when inserting pages in a loop. Arch *must* define pte_index. */ static int insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num, pgprot_t prot) { pmd_t *pmd = NULL; pte_t *start_pte, *pte; spinlock_t *pte_lock; struct mm_struct *const mm = vma->vm_mm; unsigned long curr_page_idx = 0; unsigned long remaining_pages_total = *num; unsigned long pages_to_write_in_pmd; int ret; more: ret = -EFAULT; pmd = walk_to_pmd(mm, addr); if (!pmd) goto out; pages_to_write_in_pmd = min_t(unsigned long, remaining_pages_total, PTRS_PER_PTE - pte_index(addr)); /* Allocate the PTE if necessary; takes PMD lock once only. */ ret = -ENOMEM; if (pte_alloc(mm, pmd)) goto out; while (pages_to_write_in_pmd) { int pte_idx = 0; const int batch_size = min_t(int, pages_to_write_in_pmd, 8); start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock); for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) { int err = insert_page_in_batch_locked(mm, pte, addr, pages[curr_page_idx], prot); if (unlikely(err)) { pte_unmap_unlock(start_pte, pte_lock); ret = err; remaining_pages_total -= pte_idx; goto out; } addr += PAGE_SIZE; ++curr_page_idx; } pte_unmap_unlock(start_pte, pte_lock); pages_to_write_in_pmd -= batch_size; remaining_pages_total -= batch_size; } if (remaining_pages_total) goto more; ret = 0; out: *num = remaining_pages_total; return ret; } #endif /* ifdef pte_index */ /** * vm_insert_pages - insert multiple pages into user vma, batching the pmd lock. * @vma: user vma to map to * @addr: target start user address of these pages * @pages: source kernel pages * @num: in: number of pages to map. out: number of pages that were *not* * mapped. (0 means all pages were successfully mapped). * * Preferred over vm_insert_page() when inserting multiple pages. * * In case of error, we may have mapped a subset of the provided * pages. It is the caller's responsibility to account for this case. * * The same restrictions apply as in vm_insert_page(). */ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr, struct page **pages, unsigned long *num) { #ifdef pte_index const unsigned long end_addr = addr + (*num * PAGE_SIZE) - 1; if (addr < vma->vm_start || end_addr >= vma->vm_end) return -EFAULT; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } /* Defer page refcount checking till we're about to map that page. */ return insert_pages(vma, addr, pages, num, vma->vm_page_prot); #else unsigned long idx = 0, pgcount = *num; int err = -EINVAL; for (; idx < pgcount; ++idx) { err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]); if (err) break; } *num = pgcount - idx; return err; #endif /* ifdef pte_index */ } EXPORT_SYMBOL(vm_insert_pages); /** * vm_insert_page - insert single page into user vma * @vma: user vma to map to * @addr: target user address of this page * @page: source kernel page * * This allows drivers to insert individual pages they've allocated * into a user vma. * * The page has to be a nice clean _individual_ kernel allocation. * If you allocate a compound page, you need to have marked it as * such (__GFP_COMP), or manually just split the page up yourself * (see split_page()). * * NOTE! Traditionally this was done with "remap_pfn_range()" which * took an arbitrary page protection parameter. This doesn't allow * that. Your vma protection will have to be set up correctly, which * means that if you want a shared writable mapping, you'd better * ask for a shared writable mapping! * * The page does not need to be reserved. * * Usually this function is called from f_op->mmap() handler * under mm->mmap_lock write-lock, so it can change vma->vm_flags. * Caller must set VM_MIXEDMAP on vma if it wants to call this * function from other places, for example from page-fault handler. * * Return: %0 on success, negative error code otherwise. */ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; if (!page_count(page)) return -EINVAL; if (!(vma->vm_flags & VM_MIXEDMAP)) { BUG_ON(mmap_read_trylock(vma->vm_mm)); BUG_ON(vma->vm_flags & VM_PFNMAP); vma->vm_flags |= VM_MIXEDMAP; } return insert_page(vma, addr, page, vma->vm_page_prot); } EXPORT_SYMBOL(vm_insert_page); /* * __vm_map_pages - maps range of kernel pages into user vma * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * @offset: user's requested vm_pgoff * * This allows drivers to map range of kernel pages into a user vma. * * Return: 0 on success and error code otherwise. */ static int __vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num, unsigned long offset) { unsigned long count = vma_pages(vma); unsigned long uaddr = vma->vm_start; int ret, i; /* Fail if the user requested offset is beyond the end of the object */ if (offset >= num) return -ENXIO; /* Fail if the user requested size exceeds available object size */ if (count > num - offset) return -ENXIO; for (i = 0; i < count; i++) { ret = vm_insert_page(vma, uaddr, pages[offset + i]); if (ret < 0) return ret; uaddr += PAGE_SIZE; } return 0; } /** * vm_map_pages - maps range of kernel pages starts with non zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Maps an object consisting of @num pages, catering for the user's * requested vm_pgoff * * If we fail to insert any page into the vma, the function will return * immediately leaving any previously inserted pages present. Callers * from the mmap handler may immediately return the error as their caller * will destroy the vma, removing any successfully inserted pages. Other * callers should make their own arrangements for calling unmap_region(). * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, vma->vm_pgoff); } EXPORT_SYMBOL(vm_map_pages); /** * vm_map_pages_zero - map range of kernel pages starts with zero offset * @vma: user vma to map to * @pages: pointer to array of source kernel pages * @num: number of pages in page array * * Similar to vm_map_pages(), except that it explicitly sets the offset * to 0. This function is intended for the drivers that did not consider * vm_pgoff. * * Context: Process context. Called by mmap handlers. * Return: 0 on success and error code otherwise. */ int vm_map_pages_zero(struct vm_area_struct *vma, struct page **pages, unsigned long num) { return __vm_map_pages(vma, pages, num, 0); } EXPORT_SYMBOL(vm_map_pages_zero); static vm_fault_t insert_pfn(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t prot, bool mkwrite) { struct mm_struct *mm = vma->vm_mm; pte_t *pte, entry; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) return VM_FAULT_OOM; if (!pte_none(*pte)) { if (mkwrite) { /* * For read faults on private mappings the PFN passed * in may not match the PFN we have mapped if the * mapped PFN is a writeable COW page. In the mkwrite * case we are creating a writable PTE for a shared * mapping and we expect the PFNs to match. If they * don't match, we are likely racing with block * allocation and mapping invalidation so just skip the * update. */ if (pte_pfn(*pte) != pfn_t_to_pfn(pfn)) { WARN_ON_ONCE(!is_zero_pfn(pte_pfn(*pte))); goto out_unlock; } entry = pte_mkyoung(*pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, addr, pte, entry, 1)) update_mmu_cache(vma, addr, pte); } goto out_unlock; } /* Ok, finally just insert the thing.. */ if (pfn_t_devmap(pfn)) entry = pte_mkdevmap(pfn_t_pte(pfn, prot)); else entry = pte_mkspecial(pfn_t_pte(pfn, prot)); if (mkwrite) { entry = pte_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); } set_pte_at(mm, addr, pte, entry); update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ out_unlock: pte_unmap_unlock(pte, ptl); return VM_FAULT_NOPAGE; } /** * vmf_insert_pfn_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * * This is exactly like vmf_insert_pfn(), except that it allows drivers * to override pgprot on a per-page basis. * * This only makes sense for IO mappings, and it makes no sense for * COW mappings. In general, using multiple vmas is preferable; * vmf_insert_pfn_prot should only be used if using multiple VMAs is * impractical. * * See vmf_insert_mixed_prot() for a discussion of the implication of using * a value of @pgprot different from that of @vma->vm_page_prot. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot) { /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like * consistency in testing and feature parity among all, so we should * try to keep these invariants in place for everybody. */ BUG_ON(!(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))); BUG_ON((vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP)) == (VM_PFNMAP|VM_MIXEDMAP)); BUG_ON((vma->vm_flags & VM_PFNMAP) && is_cow_mapping(vma->vm_flags)); BUG_ON((vma->vm_flags & VM_MIXEDMAP) && pfn_valid(pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; if (!pfn_modify_allowed(pfn, pgprot)) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, __pfn_to_pfn_t(pfn, PFN_DEV)); return insert_pfn(vma, addr, __pfn_to_pfn_t(pfn, PFN_DEV), pgprot, false); } EXPORT_SYMBOL(vmf_insert_pfn_prot); /** * vmf_insert_pfn - insert single pfn into user vma * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * * Similar to vm_insert_page, this allows drivers to insert individual pages * they've allocated into a user vma. Same comments apply. * * This function should only be called from a vm_ops->fault handler, and * in that case the handler should return the result of this function. * * vma cannot be a COW mapping. * * As this is called only for pages that do not currently exist, we * do not need to flush old virtual caches or the TLB. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { return vmf_insert_pfn_prot(vma, addr, pfn, vma->vm_page_prot); } EXPORT_SYMBOL(vmf_insert_pfn); static bool vm_mixed_ok(struct vm_area_struct *vma, pfn_t pfn) { /* these checks mirror the abort conditions in vm_normal_page */ if (vma->vm_flags & VM_MIXEDMAP) return true; if (pfn_t_devmap(pfn)) return true; if (pfn_t_special(pfn)) return true; if (is_zero_pfn(pfn_t_to_pfn(pfn))) return true; return false; } static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t pgprot, bool mkwrite) { int err; BUG_ON(!vm_mixed_ok(vma, pfn)); if (addr < vma->vm_start || addr >= vma->vm_end) return VM_FAULT_SIGBUS; track_pfn_insert(vma, &pgprot, pfn); if (!pfn_modify_allowed(pfn_t_to_pfn(pfn), pgprot)) return VM_FAULT_SIGBUS; /* * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* * At this point we are committed to insert_page() * regardless of whether the caller specified flags that * result in pfn_t_has_page() == false. */ page = pfn_to_page(pfn_t_to_pfn(pfn)); err = insert_page(vma, addr, page, pgprot); } else { return insert_pfn(vma, addr, pfn, pgprot, mkwrite); } if (err == -ENOMEM) return VM_FAULT_OOM; if (err < 0 && err != -EBUSY) return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } /** * vmf_insert_mixed_prot - insert single pfn into user vma with specified pgprot * @vma: user vma to map to * @addr: target user address of this page * @pfn: source kernel pfn * @pgprot: pgprot flags for the inserted page * * This is exactly like vmf_insert_mixed(), except that it allows drivers * to override pgprot on a per-page basis. * * Typically this function should be used by drivers to set caching- and * encryption bits different than those of @vma->vm_page_prot, because * the caching- or encryption mode may not be known at mmap() time. * This is ok as long as @vma->vm_page_prot is not used by the core vm * to set caching and encryption bits for those vmas (except for COW pages). * This is ensured by core vm only modifying these page table entries using * functions that don't touch caching- or encryption bits, using pte_modify() * if needed. (See for example mprotect()). * Also when new page-table entries are created, this is only done using the * fault() callback, and never using the value of vma->vm_page_prot, * except for page-table entries that point to anonymous pages as the result * of COW. * * Context: Process context. May allocate using %GFP_KERNEL. * Return: vm_fault_t value. */ vm_fault_t vmf_insert_mixed_prot(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn, pgprot_t pgprot) { return __vm_insert_mixed(vma, addr, pfn, pgprot, false); } EXPORT_SYMBOL(vmf_insert_mixed_prot); vm_fault_t vmf_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, false); } EXPORT_SYMBOL(vmf_insert_mixed); /* * If the insertion of PTE failed because someone else already added a * different entry in the mean time, we treat that as success as we assume * the same entry was actually inserted. */ vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn) { return __vm_insert_mixed(vma, addr, pfn, vma->vm_page_prot, true); } EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. the old * mappings are removed. any references to nonexistent pages results * in null mappings (currently treated as "copy-on-access") */ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pte_t *pte, *mapped_pte; spinlock_t *ptl; int err = 0; mapped_pte = pte = pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; arch_enter_lazy_mmu_mode(); do { BUG_ON(!pte_none(*pte)); if (!pfn_modify_allowed(pfn, prot)) { err = -EACCES; break; } set_pte_at(mm, addr, pte, pte_mkspecial(pfn_pte(pfn, prot))); pfn++; } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(mapped_pte, ptl); return err; } static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pmd_t *pmd; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); err = remap_pte_range(mm, pmd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pmd++, addr = next, addr != end); return 0; } static inline int remap_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { pud_t *pud; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; pud = pud_alloc(mm, p4d, addr); if (!pud) return -ENOMEM; do { next = pud_addr_end(addr, end); err = remap_pmd_range(mm, pud, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (pud++, addr = next, addr != end); return 0; } static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, unsigned long pfn, pgprot_t prot) { p4d_t *p4d; unsigned long next; int err; pfn -= addr >> PAGE_SHIFT; p4d = p4d_alloc(mm, pgd, addr); if (!p4d) return -ENOMEM; do { next = p4d_addr_end(addr, end); err = remap_pud_range(mm, p4d, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) return err; } while (p4d++, addr = next, addr != end); return 0; } /** * remap_pfn_range - remap kernel memory to userspace * @vma: user vma to map to * @addr: target page aligned user address to start at * @pfn: page frame number of kernel physical memory address * @size: size of mapping area * @prot: page protection flags for this mapping * * Note: this is only safe if the mm semaphore is held when called. * * Return: %0 on success, negative error code otherwise. */ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, unsigned long size, pgprot_t prot) { pgd_t *pgd; unsigned long next; unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; unsigned long remap_pfn = pfn; int err; if (WARN_ON_ONCE(!PAGE_ALIGNED(addr))) return -EINVAL; /* * Physically remapped pages are special. Tell the * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). * VM_PFNMAP tells the core MM that the base pages are just * raw PFN mappings, and do not have a "struct page" associated * with them. * VM_DONTEXPAND * Disable vma merging and expanding with mremap(). * VM_DONTDUMP * Omit vma from core dump, even when VM_IO turned off. * * There's a horrible special case to handle copy-on-write * behaviour that some programs depend on. We mark the "original" * un-COW'ed pages by matching them up with "vma->vm_pgoff". * See vm_normal_page() for details. */ if (is_cow_mapping(vma->vm_flags)) { if (addr != vma->vm_start || end != vma->vm_end) return -EINVAL; vma->vm_pgoff = pfn; } err = track_pfn_remap(vma, &prot, remap_pfn, addr, PAGE_ALIGN(size)); if (err) return -EINVAL; vma->vm_flags |= VM_IO | VM_PFNMAP | VM_DONTEXPAND | VM_DONTDUMP; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; pgd = pgd_offset(mm, addr); flush_cache_range(vma, addr, end); do { next = pgd_addr_end(addr, end); err = remap_p4d_range(mm, pgd, addr, next, pfn + (addr >> PAGE_SHIFT), prot); if (err) break; } while (pgd++, addr = next, addr != end); if (err) untrack_pfn(vma, remap_pfn, PAGE_ALIGN(size)); return err; } EXPORT_SYMBOL(remap_pfn_range); /** * vm_iomap_memory - remap memory to userspace * @vma: user vma to map to * @start: start of the physical memory to be mapped * @len: size of area * * This is a simplified io_remap_pfn_range() for common driver use. The * driver just needs to give us the physical memory range to be mapped, * we'll figure out the rest from the vma information. * * NOTE! Some drivers might want to tweak vma->vm_page_prot first to get * whatever write-combining details or similar. * * Return: %0 on success, negative error code otherwise. */ int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len) { unsigned long vm_len, pfn, pages; /* Check that the physical memory area passed in looks valid */ if (start + len < start) return -EINVAL; /* * You *really* shouldn't map things that aren't page-aligned, * but we've historically allowed it because IO memory might * just have smaller alignment. */ len += start & ~PAGE_MASK; pfn = start >> PAGE_SHIFT; pages = (len + ~PAGE_MASK) >> PAGE_SHIFT; if (pfn + pages < pfn) return -EINVAL; /* We start the mapping 'vm_pgoff' pages into the area */ if (vma->vm_pgoff > pages) return -EINVAL; pfn += vma->vm_pgoff; pages -= vma->vm_pgoff; /* Can we fit all of the mapping? */ vm_len = vma->vm_end - vma->vm_start; if (vm_len >> PAGE_SHIFT > pages) return -EINVAL; /* Ok, let it rip */ return io_remap_pfn_range(vma, vma->vm_start, pfn, vm_len, vma->vm_page_prot); } EXPORT_SYMBOL(vm_iomap_memory); static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pte_t *pte; int err = 0; spinlock_t *ptl; if (create) { pte = (mm == &init_mm) ? pte_alloc_kernel_track(pmd, addr, mask) : pte_alloc_map_lock(mm, pmd, addr, &ptl); if (!pte) return -ENOMEM; } else { pte = (mm == &init_mm) ? pte_offset_kernel(pmd, addr) : pte_offset_map_lock(mm, pmd, addr, &ptl); } BUG_ON(pmd_huge(*pmd)); arch_enter_lazy_mmu_mode(); if (fn) { do { if (create || !pte_none(*pte)) { err = fn(pte++, addr, data); if (err) break; } } while (addr += PAGE_SIZE, addr != end); } *mask |= PGTBL_PTE_MODIFIED; arch_leave_lazy_mmu_mode(); if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); return err; } static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pmd_t *pmd; unsigned long next; int err = 0; BUG_ON(pud_huge(*pud)); if (create) { pmd = pmd_alloc_track(mm, pud, addr, mask); if (!pmd) return -ENOMEM; } else { pmd = pmd_offset(pud, addr); } do { next = pmd_addr_end(addr, end); if (create || !pmd_none_or_clear_bad(pmd)) { err = apply_to_pte_range(mm, pmd, addr, next, fn, data, create, mask); if (err) break; } } while (pmd++, addr = next, addr != end); return err; } static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { pud_t *pud; unsigned long next; int err = 0; if (create) { pud = pud_alloc_track(mm, p4d, addr, mask); if (!pud) return -ENOMEM; } else { pud = pud_offset(p4d, addr); } do { next = pud_addr_end(addr, end); if (create || !pud_none_or_clear_bad(pud)) { err = apply_to_pmd_range(mm, pud, addr, next, fn, data, create, mask); if (err) break; } } while (pud++, addr = next, addr != end); return err; } static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd, unsigned long addr, unsigned long end, pte_fn_t fn, void *data, bool create, pgtbl_mod_mask *mask) { p4d_t *p4d; unsigned long next; int err = 0; if (create) { p4d = p4d_alloc_track(mm, pgd, addr, mask); if (!p4d) return -ENOMEM; } else { p4d = p4d_offset(pgd, addr); } do { next = p4d_addr_end(addr, end); if (create || !p4d_none_or_clear_bad(p4d)) { err = apply_to_pud_range(mm, p4d, addr, next, fn, data, create, mask); if (err) break; } } while (p4d++, addr = next, addr != end); return err; } static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data, bool create) { pgd_t *pgd; unsigned long start = addr, next; unsigned long end = addr + size; pgtbl_mod_mask mask = 0; int err = 0; if (WARN_ON(addr >= end)) return -EINVAL; pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); if (!create && pgd_none_or_clear_bad(pgd)) continue; err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask); if (err) break; } while (pgd++, addr = next, addr != end); if (mask & ARCH_PAGE_TABLE_SYNC_MASK) arch_sync_kernel_mappings(start, start + size); return err; } /* * Scan a region of virtual memory, filling in page tables as necessary * and calling a provided function on each leaf page table. */ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, true); } EXPORT_SYMBOL_GPL(apply_to_page_range); /* * Scan a region of virtual memory, calling a provided function on * each leaf page table where it exists. * * Unlike apply_to_page_range, this does _not_ fill in page tables * where they are absent. */ int apply_to_existing_page_range(struct mm_struct *mm, unsigned long addr, unsigned long size, pte_fn_t fn, void *data) { return __apply_to_page_range(mm, addr, size, fn, data, false); } EXPORT_SYMBOL_GPL(apply_to_existing_page_range); /* * handle_pte_fault chooses page fault handler according to an entry which was * read non-atomically. Before making any commitment, on those architectures * or configurations (e.g. i386 with PAE) which might give a mix of unmatched * parts, do_swap_page must check under lock before unmapping the pte and * proceeding (but do_wp_page is only called after already making such a check; * and do_anonymous_page can safely check later on). */ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, pte_t *page_table, pte_t orig_pte) { int same = 1; #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPTION) if (sizeof(pte_t) > sizeof(unsigned long)) { spinlock_t *ptl = pte_lockptr(mm, pmd); spin_lock(ptl); same = pte_same(*page_table, orig_pte); spin_unlock(ptl); } #endif pte_unmap(page_table); return same; } static inline bool cow_user_page(struct page *dst, struct page *src, struct vm_fault *vmf) { bool ret; void *kaddr; void __user *uaddr; bool locked = false; struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; unsigned long addr = vmf->address; if (likely(src)) { copy_user_highpage(dst, src, addr, vma); return true; } /* * If the source page was a PFN mapping, we don't have * a "struct page" for it. We do a best-effort copy by * just copying from the original user address. If that * fails, we just zero-fill it. Live with it. */ kaddr = kmap_atomic(dst); uaddr = (void __user *)(addr & PAGE_MASK); /* * On architectures with software "accessed" bits, we would * take a double page fault, so mark it accessed here. */ if (arch_faults_on_old_pte() && !pte_young(vmf->orig_pte)) { pte_t entry; vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); locked = true; if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* * Other thread has already handled the fault * and update local tlb only */ update_mmu_tlb(vma, addr, vmf->pte); ret = false; goto pte_unlock; } entry = pte_mkyoung(vmf->orig_pte); if (ptep_set_access_flags(vma, addr, vmf->pte, entry, 0)) update_mmu_cache(vma, addr, vmf->pte); } /* * This really shouldn't fail, because the page is there * in the page tables. But it might just be unreadable, * in which case we just give up and fill the result with * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { if (locked) goto warn; /* Re-validate under PTL if the page is still mapped */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl); locked = true; if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) { /* The PTE changed under us, update local tlb */ update_mmu_tlb(vma, addr, vmf->pte); ret = false; goto pte_unlock; } /* * The same page can be mapped back since last copy attempt. * Try to copy again under PTL. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* * Give a warn in case there can be some obscure * use-case */ warn: WARN_ON_ONCE(1); clear_page(kaddr); } } ret = true; pte_unlock: if (locked) pte_unmap_unlock(vmf->pte, vmf->ptl); kunmap_atomic(kaddr); flush_dcache_page(dst); return ret; } static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) { struct file *vm_file = vma->vm_file; if (vm_file) return mapping_gfp_mask(vm_file->f_mapping) | __GFP_FS | __GFP_IO; /* * Special mappings (e.g. VDSO) do not have any file so fake * a default GFP_KERNEL for them. */ return GFP_KERNEL; } /* * Notify the address space that the page is about to become writable so that * it can prohibit this or wait for the page to get into an appropriate state. * * We do this without the lock held, so that it can sleep if it needs to. */ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) { vm_fault_t ret; struct page *page = vmf->page; unsigned int old_flags = vmf->flags; vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; if (vmf->vma->vm_file && IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) return VM_FAULT_SIGBUS; ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; if (unlikely(!(ret & VM_FAULT_LOCKED))) { lock_page(page); if (!page->mapping) { unlock_page(page); return 0; /* retry */ } ret |= VM_FAULT_LOCKED; } else VM_BUG_ON_PAGE(!PageLocked(page), page); return ret; } /* * Handle dirtying of a page in shared file mapping on a write fault. * * The function expects the page to be locked and unlocks it. */ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct address_space *mapping; struct page *page = vmf->page; bool dirtied; bool page_mkwrite = vma->vm_ops && vma->vm_ops->page_mkwrite; dirtied = set_page_dirty(page); VM_BUG_ON_PAGE(PageAnon(page), page); /* * Take a local copy of the address_space - page.mapping may be zeroed * by truncate after unlock_page(). The address_space itself remains * pinned by vma->vm_file's reference. We rely on unlock_page()'s * release semantics to prevent the compiler from undoing this copying. */ mapping = page_rmapping(page); unlock_page(page); if (!page_mkwrite) file_update_time(vma->vm_file); /* * Throttle page dirtying rate down to writeback speed. * * mapping may be NULL here because some device drivers do not * set page.mapping but still dirty their pages * * Drop the mmap_lock before waiting on IO, if we can. The file * is pinning the mapping, as per above. */ if ((dirtied || page_mkwrite) && mapping) { struct file *fpin; fpin = maybe_unlock_mmap_for_io(vmf, NULL); balance_dirty_pages_ratelimited(mapping); if (fpin) { fput(fpin); return VM_FAULT_RETRY; } } return 0; } /* * Handle write page faults for pages that can be reused in the current vma * * This can happen either due to the mapping being with the VM_SHARED flag, * or due to us being the last reference standing to the page. In either * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ static inline void wp_page_reuse(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; struct page *page = vmf->page; pte_t entry; /* * Clear the pages cpupid information as the existing * information potentially belongs to a now completely * unrelated process. */ if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); count_vm_event(PGREUSE); } /* * Handle the case of a page which we actually need to copy to a new page. * * Called with mmap_lock locked and the old page referenced, but * without the ptl held. * * High level logic flow: * * - Allocate a page, copy the content of the old page to the new one. * - Handle book keeping and accounting - cgroups, mmu-notifiers, etc. * - Take the PTL. If the pte changed, bail out and release the allocated page * - If the pte is still the way we remember it, update the page table and all * relevant references. This includes dropping the reference the page-table * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ static vm_fault_t wp_page_copy(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct page *old_page = vmf->page; struct page *new_page = NULL; pte_t entry; int page_copied = 0; struct mmu_notifier_range range; if (unlikely(anon_vma_prepare(vma))) goto oom; if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) goto oom; if (!cow_user_page(new_page, old_page, vmf)) { /* * COW failed, if the fault was solved by other, * it's fine. If not, userspace would re-fault on * the same address and we will handle the fault * from the second attempt. */ put_page(new_page); if (old_page) put_page(old_page); return 0; } } if (mem_cgroup_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; cgroup_throttle_swaprate(new_page, GFP_KERNEL); __SetPageUptodate(new_page); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, vmf->address & PAGE_MASK, (vmf->address & PAGE_MASK) + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); /* * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, mm_counter_file(old_page)); inc_mm_counter_fast(mm, MM_ANONPAGES); } } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* * Clear the pte entry and flush it first, before updating the * pte with the new entry. This will avoid a race condition * seen in the presence of one thread doing SMC and another * thread doing COW. */ ptep_clear_flush_notify(vma, vmf->address, vmf->pte); page_add_new_anon_rmap(new_page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(new_page, vma); /* * We call the notify macro here because, when using secondary * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ set_pte_at_notify(mm, vmf->address, vmf->pte, entry); update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may * we remove the mapcount here. Otherwise another * process may come and find the rmap count decremented * before the pte is switched to the new page, and * "reuse" the old page writing into it while our pte * here still points into it and can be read by other * threads. * * The critical issue is to order this * page_remove_rmap with the ptp_clear_flush above. * Those stores are ordered by (if nothing else,) * the barrier present in the atomic_add_negative * in page_remove_rmap. * * Then the TLB flush in ptep_clear_flush ensures that * no process can access the old page before the * decremented mapcount is visible. And the old page * cannot be reused until after the decremented * mapcount is visible. So transitively, TLBs to * old page will be flushed before it can be reused. */ page_remove_rmap(old_page, false); } /* Free the old page.. */ new_page = old_page; page_copied = 1; } else { update_mmu_tlb(vma, vmf->address, vmf->pte); } if (new_page) put_page(new_page); pte_unmap_unlock(vmf->pte, vmf->ptl); /* * No need to double call mmu_notifier->invalidate_range() callback as * the above ptep_clear_flush_notify() did already call it. */ mmu_notifier_invalidate_range_only_end(&range); if (old_page) { /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. */ if (page_copied && (vma->vm_flags & VM_LOCKED)) { lock_page(old_page); /* LRU manipulation */ if (PageMlocked(old_page)) munlock_vma_page(old_page); unlock_page(old_page); } put_page(old_page); } return page_copied ? VM_FAULT_WRITE : 0; oom_free_new: put_page(new_page); oom: if (old_page) put_page(old_page); return VM_FAULT_OOM; } /** * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE * writeable once the page is prepared * * @vmf: structure describing the fault * * This function handles all that is needed to finish a write page fault in a * shared mapping due to PTE being read-only once the mapped page is prepared. * It handles locking of PTE and modifying it. * * The function expects the page to be locked or other protection against * concurrent faults / writeback (such as DAX radix tree locks). * * Return: %VM_FAULT_WRITE on success, %0 when PTE got changed before * we acquired PTE lock. */ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf) { WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); /* * We might have raced with another page fault while we released the * pte_offset_map_lock. */ if (!pte_same(*vmf->pte, vmf->orig_pte)) { update_mmu_tlb(vmf->vma, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); return VM_FAULT_NOPAGE; } wp_page_reuse(vmf); return 0; } /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ static vm_fault_t wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { vm_fault_t ret; pte_unmap_unlock(vmf->pte, vmf->ptl); vmf->flags |= FAULT_FLAG_MKWRITE; ret = vma->vm_ops->pfn_mkwrite(vmf); if (ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)) return ret; return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); return VM_FAULT_WRITE; } static vm_fault_t wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = VM_FAULT_WRITE; get_page(vmf->page); if (vma->vm_ops && vma->vm_ops->page_mkwrite) { vm_fault_t tmp; pte_unmap_unlock(vmf->pte, vmf->ptl); tmp = do_page_mkwrite(vmf); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(vmf->page); return tmp; } tmp = finish_mkwrite_fault(vmf); if (unlikely(tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) { unlock_page(vmf->page); put_page(vmf->page); return tmp; } } else { wp_page_reuse(vmf); lock_page(vmf->page); } ret |= fault_dirty_shared_page(vmf); put_page(vmf->page); return ret; } /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address * and decrementing the shared-page counter for the old page. * * Note that this routine assumes that the protection checks have been * done by the caller (the low-level page fault routine in most cases). * Thus we can safely just mark it writable once we've done any necessary * COW. * * We also mark the page dirty at this point even though the page will * change only once the write actually happens. This avoids a few races, * and potentially makes it more efficient. * * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; if (userfaultfd_pte_wp(vma, *vmf->pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_WP); } /* * Userfaultfd write-protect can defer flushes. Ensure the TLB * is flushed in this case before copying. */ if (unlikely(userfaultfd_wp(vmf->vma) && mm_tlb_flush_pending(vmf->vma->vm_mm))) flush_tlb_page(vmf->vma, vmf->address); vmf->page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!vmf->page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a * VM_PFNMAP VMA. * * We should not cow pages in a shared writeable mapping. * Just mark the pages writable and/or call ops->pfn_mkwrite. */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) return wp_pfn_shared(vmf); pte_unmap_unlock(vmf->pte, vmf->ptl); return wp_page_copy(vmf); } /* * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ if (PageAnon(vmf->page)) { struct page *page = vmf->page; /* PageKsm() doesn't necessarily raise the page refcount */ if (PageKsm(page) || page_count(page) != 1) goto copy; if (!trylock_page(page)) goto copy; if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) { unlock_page(page); goto copy; } /* * Ok, we've got the only map reference, and the only * page count reference, and the page is locked, * it's dark out, and we're wearing sunglasses. Hit it. */ unlock_page(page); wp_page_reuse(vmf); return VM_FAULT_WRITE; } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { return wp_page_shared(vmf); } copy: /* * Ok, we need to copy. Oh, well.. */ get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); return wp_page_copy(vmf); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, struct zap_details *details) { zap_page_range_single(vma, start_addr, end_addr - start_addr, details); } static inline void unmap_mapping_range_tree(struct rb_root_cached *root, struct zap_details *details) { struct vm_area_struct *vma; pgoff_t vba, vea, zba, zea; vma_interval_tree_foreach(vma, root, details->first_index, details->last_index) { vba = vma->vm_pgoff; vea = vba + vma_pages(vma) - 1; zba = details->first_index; if (zba < vba) zba = vba; zea = details->last_index; if (zea > vea) zea = vea; unmap_mapping_range_vma(vma, ((zba - vba) << PAGE_SHIFT) + vma->vm_start, ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start, details); } } /** * unmap_mapping_page() - Unmap single page from processes. * @page: The locked page to be unmapped. * * Unmap this page from any userspace process which still has it mmaped. * Typically, for efficiency, the range of nearby pages has already been * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once * truncation or invalidation holds the lock on a page, it may find that * the page has been remapped again: and then uses unmap_mapping_page() * to unmap it finally. */ void unmap_mapping_page(struct page *page) { struct address_space *mapping = page->mapping; struct zap_details details = { }; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageTail(page)); details.check_mapping = mapping; details.first_index = page->index; details.last_index = page->index + thp_nr_pages(page) - 1; details.single_page = page; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } /** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. * @nr: Number of pages to be unmapped. 0 to unmap to end of file. * @even_cows: Whether to unmap even private COWed pages. * * Unmap the pages in this address space from any userspace process which * has them mmaped. Generally, you want to remove COWed pages as well when * a file is being truncated, but not when invalidating pages from the page * cache. */ void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { struct zap_details details = { }; details.check_mapping = even_cows ? NULL : mapping; details.first_index = start; details.last_index = start + nr - 1; if (details.last_index < details.first_index) details.last_index = ULONG_MAX; i_mmap_lock_write(mapping); if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) unmap_mapping_range_tree(&mapping->i_mmap, &details); i_mmap_unlock_write(mapping); } /** * unmap_mapping_range - unmap the portion of all mmaps in the specified * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded * up to a PAGE_SIZE boundary. A holelen of zero truncates to the * end of the file. * @even_cows: 1 when truncating a file, unmap even private COWed pages; * but 0 when invalidating pagecache, don't throw away private data. */ void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; /* Check for overflow. */ if (sizeof(holelen) > sizeof(hlen)) { long long holeend = (holebegin + holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; if (holeend & ~(long long)ULONG_MAX) hlen = ULONG_MAX - hba + 1; } unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with pte unmapped and unlocked. * * We return with the mmap_lock locked or unlocked in the same cases * as does filemap_fault(). */ vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *swapcache; swp_entry_t entry; pte_t pte; int locked; int exclusive = 0; vm_fault_t ret = 0; void *shadow = NULL; if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, vmf->address); } else if (is_device_private_entry(entry)) { vmf->page = device_private_entry_to_page(entry); ret = vmf->page->pgmap->ops->migrate_to_ram(vmf); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; if (!page) { struct swap_info_struct *si = swp_swap_info(entry); if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (page) { int err; __SetPageLocked(page); __SetPageSwapBacked(page); set_page_private(page, entry.val); /* Tell memcg to use swap ownership records */ SetPageSwapCache(page); err = mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL); ClearPageSwapCache(page); if (err) { ret = VM_FAULT_OOM; goto out_page; } shadow = get_shadow_from_swap_cache(entry); if (shadow) workingset_refault(page, shadow); lru_cache_add(page); swap_readpage(page, true); } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = page; } if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; } /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); } else if (PageHWPoison(page)) { /* * hwpoisoned dirty swapcache pages are kept for killing * owner processes (which may be unknown at hwpoison time) */ ret = VM_FAULT_HWPOISON; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto out_release; } locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { ret |= VM_FAULT_RETRY; goto out_release; } /* * Make sure try_to_free_swap or reuse_swap_page or swapoff did not * release the swapcache from under us. The page pin, and pte_same * test below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not changed. */ if (unlikely((!PageSwapCache(page) || page_private(page) != entry.val)) && swapcache) goto out_page; page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; goto out_page; } cgroup_throttle_swaprate(page, GFP_KERNEL); /* * Back out if somebody else already faulted in this pte. */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } /* * The page isn't present yet, go ahead with the fault. * * Be careful about the sequence of operations here. * To get its accounting right, reuse_swap_page() must be called * while the page is counted on swap but not yet in mapcount i.e. * before page_add_anon_rmap() and swap_free(); try_to_free_swap() * must be called after the swap_free(), or it will never succeed. */ inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); if (pte_swp_uffd_wp(vmf->orig_pte)) { pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); } swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check * (to avoid false positives from pte_same). For * further safety release the lock after the swap_free * so that the swap count won't change under a * parallel locked swapcache. */ unlock_page(swapcache); put_page(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: put_page(page); if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); } return ret; } /* * We enter with non-exclusive mmap_lock (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_lock still held, but pte unmapped and unlocked. */ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page; vm_fault_t ret = 0; pte_t entry; /* File mapping without ->vm_ops ? */ if (vma->vm_flags & VM_SHARED) return VM_FAULT_SIGBUS; /* * Use pte_alloc() instead of pte_alloc_map(). We can't run * pte_offset_map() on pmds where a huge pmd might be created * from a different thread. * * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when * parallel threads are excluded by other means. * * Here we only have mmap_read_lock(mm). */ if (pte_alloc(vma->vm_mm, vmf->pmd)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { update_mmu_tlb(vma, vmf->address, vmf->pte); goto unlock; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL)) goto oom_free_page; cgroup_throttle_swaprate(page, GFP_KERNEL); /* * The memory barrier inside __SetPageUptodate makes sure that * preceding stores to the page contents become visible before * the set_pte_at() write. */ __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (!pte_none(*vmf->pte)) { update_mmu_cache(vma, vmf->address, vmf->pte); goto release; } ret = check_stable_address_space(vma->vm_mm); if (ret) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(page); return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); setpte: set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ update_mmu_cache(vma, vmf->address, vmf->pte); unlock: pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; release: put_page(page); goto unlock; oom_free_page: put_page(page); oom: return VM_FAULT_OOM; } /* * The mmap_lock must have been held on entry, and may have been * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ static vm_fault_t __do_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; /* * Preallocate pte before we take page_lock because this might lead to * deadlocks for memcg reclaim which waits for pages under writeback: * lock_page(A) * SetPageWriteback(A) * unlock_page(A) * lock_page(B) * lock_page(B) * pte_alloc_one * shrink_page_list * wait_on_page_writeback(A) * SetPageWriteback(B) * unlock_page(B) * # flush A, B to clear the writeback */ if (pmd_none(*vmf->pmd) && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } ret = vma->vm_ops->fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | VM_FAULT_DONE_COW))) return ret; if (unlikely(PageHWPoison(vmf->page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf->page); put_page(vmf->page); vmf->page = NULL; return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) lock_page(vmf->page); else VM_BUG_ON_PAGE(!PageLocked(vmf->page), vmf->page); return ret; } /* * The ordering of these checks is important for pmds with _PAGE_DEVMAP set. * If we check pmd_trans_unstable() first we will trip the bad_pmd() check * inside of pmd_none_or_trans_huge_or_clear_bad(). This will end up correctly * returning 1 but not before it spams dmesg with the pmd_clear_bad() output. */ static int pmd_devmap_trans_unstable(pmd_t *pmd) { return pmd_devmap(*pmd) || pmd_trans_unstable(pmd); } static vm_fault_t pte_alloc_one_map(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; if (!pmd_none(*vmf->pmd)) goto map_pte; if (vmf->prealloc_pte) { vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { spin_unlock(vmf->ptl); goto map_pte; } mm_inc_nr_ptes(vma->vm_mm); pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); spin_unlock(vmf->ptl); vmf->prealloc_pte = NULL; } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd))) { return VM_FAULT_OOM; } map_pte: /* * If a huge pmd materialized under us just retry later. Use * pmd_trans_unstable() via pmd_devmap_trans_unstable() instead of * pmd_trans_huge() to ensure the pmd didn't become pmd_trans_huge * under us and then back to pmd_none, as a result of MADV_DONTNEED * running immediately after a huge pmd fault in a different thread of * this mm, in turn leading to a misleading pmd_trans_huge() retval. * All we have to ensure is that it is a regular pmd that we can walk * with pte_offset_map() and we can do that through an atomic read in * C, which is what pmd_trans_unstable() provides. */ if (pmd_devmap_trans_unstable(vmf->pmd)) return VM_FAULT_NOPAGE; /* * At this point we know that our vmf->pmd points to a page of ptes * and it cannot become pmd_none(), pmd_devmap() or pmd_trans_huge() * for the duration of the fault. If a racing MADV_DONTNEED runs and * we zap the ptes pointed to by our vmf->pmd, the vmf->ptl will still * be valid and we will re-check to make sure the vmf->pte isn't * pte_none() under vmf->ptl protection when we return to * alloc_set_pte(). */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); return 0; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static void deposit_prealloc_pte(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ mm_inc_nr_ptes(vma->vm_mm); vmf->prealloc_pte = NULL; } static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i; vm_fault_t ret = VM_FAULT_FALLBACK; if (!transhuge_vma_suitable(vma, haddr)) return ret; page = compound_head(page); if (compound_order(page) != HPAGE_PMD_ORDER) return ret; /* * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. */ if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { vmf->prealloc_pte = pte_alloc_one(vma->vm_mm); if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) flush_icache_page(vma, page + i); entry = mk_huge_pmd(page, vma->vm_page_prot); if (write) entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); add_mm_counter(vma->vm_mm, mm_counter_file(page), HPAGE_PMD_NR); page_add_file_rmap(page, true); /* * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) deposit_prealloc_pte(vmf); set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; count_vm_event(THP_FILE_MAPPED); out: spin_unlock(vmf->ptl); return ret; } #else static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; } #endif /** * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the function allocates page table or use pre-allocated. * * @vmf: fault environment * @page: page to map * * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. * * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page) { struct vm_area_struct *vma = vmf->vma; bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; vm_fault_t ret; if (pmd_none(*vmf->pmd) && PageTransCompound(page)) { ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) return ret; } if (!vmf->pte) { ret = pte_alloc_one_map(vmf); if (ret) return ret; } /* Re-check under ptl */ if (unlikely(!pte_none(*vmf->pte))) { update_mmu_tlb(vma, vmf->address, vmf->pte); return VM_FAULT_NOPAGE; } flush_icache_page(vma, page); entry = mk_pte(page, vma->vm_page_prot); entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, vmf->address, false); lru_cache_add_inactive_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, vmf->address, vmf->pte); return 0; } /** * finish_fault - finish page fault once we have prepared the page to fault * * @vmf: structure describing the fault * * This function handles all that is needed to finish a page fault once the * page to fault in is prepared. It handles locking of PTEs, inserts PTE for * given page, adds reverse page mapping, handles memcg charges and LRU * addition. * * The function expects the page to be locked and on success it consumes a * reference of a page being mapped (for the PTE which maps it). * * Return: %0 on success, %VM_FAULT_ code in case of error. */ vm_fault_t finish_fault(struct vm_fault *vmf) { struct page *page; vm_fault_t ret = 0; /* Did we COW the page? */ if ((vmf->flags & FAULT_FLAG_WRITE) && !(vmf->vma->vm_flags & VM_SHARED)) page = vmf->cow_page; else page = vmf->page; /* * check even for read faults because we might have lost our CoWed * page */ if (!(vmf->vma->vm_flags & VM_SHARED)) ret = check_stable_address_space(vmf->vma->vm_mm); if (!ret) ret = alloc_set_pte(vmf, page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); return ret; } static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) { *val = fault_around_bytes; return 0; } /* * fault_around_bytes must be rounded down to the nearest page order as it's * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; if (val > PAGE_SIZE) fault_around_bytes = rounddown_pow_of_two(val); else fault_around_bytes = PAGE_SIZE; /* rounddown_pow_of_two(0) is undefined */ return 0; } DEFINE_DEBUGFS_ATTRIBUTE(fault_around_bytes_fops, fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { debugfs_create_file_unsafe("fault_around_bytes", 0644, NULL, NULL, &fault_around_bytes_fops); return 0; } late_initcall(fault_around_debugfs); #endif /* * do_fault_around() tries to map few pages around the fault address. The hope * is that the pages will be needed soon and this will lower the number of * faults to handle. * * It uses vm_ops->map_pages() to map the pages, which skips the page if it's * not ready to be mapped: not up-to-date, locked, etc. * * This function is called with the page table lock taken. In the split ptlock * case the page table lock only protects only those entries which belong to * the page table corresponding to the fault address. * * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * * fault_around_bytes defines how many bytes we'll try to map. * do_fault_around() expects it to be set to a power of two less than or equal * to PTRS_PER_PTE. * * The virtual address of the area that we map is naturally aligned to * fault_around_bytes rounded down to the machine page size * (and therefore to page order). This way it's easier to guarantee * that we don't cross page table boundaries. */ static vm_fault_t do_fault_around(struct vm_fault *vmf) { unsigned long address = vmf->address, nr_pages, mask; pgoff_t start_pgoff = vmf->pgoff; pgoff_t end_pgoff; int off; vm_fault_t ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; vmf->address = max(address & mask, vmf->vma->vm_start); off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* * end_pgoff is either the end of the page table, the end of * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); if (pmd_none(*vmf->pmd)) { vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm); if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? */ if (!vmf->pte) goto out; /* check if the page fault is solved */ vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; pte_unmap_unlock(vmf->pte, vmf->ptl); out: vmf->address = address; vmf->pte = NULL; return ret; } static vm_fault_t do_read_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret = 0; /* * Let's call ->map_pages() first and use ->fault() as fallback * if page by the offset is not ready to be mapped (cold cache or * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { ret = do_fault_around(vmf); if (ret) return ret; } ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; ret |= finish_fault(vmf); unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) put_page(vmf->page); return ret; } static vm_fault_t do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret; if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; vmf->cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!vmf->cow_page) return VM_FAULT_OOM; if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL); ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (ret & VM_FAULT_DONE_COW) return ret; copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); ret |= finish_fault(vmf); unlock_page(vmf->page); put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; uncharge_out: put_page(vmf->cow_page); return ret; } static vm_fault_t do_shared_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; vm_fault_t ret, tmp; ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; /* * Check if the backing address space wants to know that the page is * about to become writable */