/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM task

#if !defined(_TRACE_TASK_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TASK_H
#include <linux/tracepoint.h>

TRACE_EVENT(task_newtask,

	TP_PROTO(struct task_struct *task, unsigned long clone_flags),

	TP_ARGS(task, clone_flags),

	TP_STRUCT__entry(
		__field( pid_t, pid)
		__array( char, comm, TASK_COMM_LEN)
		__field( unsigned long, clone_flags)
		__field( short, oom_score_adj)
	),

	TP_fast_assign(
		__entry->pid = task->pid;
		memcpy(__entry->comm, task->comm, TASK_COMM_LEN);
		__entry->clone_flags = clone_flags;
		__entry->oom_score_adj = task->signal->oom_score_adj;
	),

	TP_printk("pid=%d comm=%s clone_flags=%lx oom_score_adj=%hd",
		__entry->pid, __entry->comm,
		__entry->clone_flags, __entry->oom_score_adj)
);

TRACE_EVENT(task_rename,

	TP_PROTO(struct task_struct *task, const char *comm),

	TP_ARGS(task, comm),

	TP_STRUCT__entry(
		__field( pid_t, pid)
		__array( char, oldcomm, TASK_COMM_LEN)
		__array( char, newcomm, TASK_COMM_LEN)
		__field( short, oom_score_adj)
	),

	TP_fast_assign(
		__entry->pid = task->pid;
		memcpy(__entry->oldcomm, task->comm, TASK_COMM_LEN);
		strlcpy(__entry->newcomm, comm, TASK_COMM_LEN);
		__entry->oom_score_adj = task->signal->oom_score_adj;
	),

	TP_printk("pid=%d oldcomm=%s newcomm=%s oom_score_adj=%hd",
		__entry->pid, __entry->oldcomm,
		__entry->newcomm, __entry->oom_score_adj)
);

#endif

/* This part must be outside protection */
#include <trace/define_trace.h>
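/*
 * Illustrative sketch (not part of the original header): the TRACE_EVENT()
 * definitions above generate trace_task_newtask() and trace_task_rename()
 * with the TP_PROTO() signatures, assuming the tracepoints are instantiated
 * elsewhere with CREATE_TRACE_POINTS. The helper below is a hypothetical
 * stand-in for the real call sites in the fork/exec paths.
 */
#include <trace/events/task.h>

static void example_emit_task_events(struct task_struct *p,
				     unsigned long clone_flags,
				     const char *new_name)
{
	/* Fired once a new task has been fully set up. */
	trace_task_newtask(p, clone_flags);

	/* Fired when the task's comm string is about to change. */
	trace_task_rename(p, new_name);
}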
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_HUGETLB_INLINE_H
#define _LINUX_HUGETLB_INLINE_H

#ifdef CONFIG_HUGETLB_PAGE

#include <linux/mm.h>

static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return !!(vma->vm_flags & VM_HUGETLB);
}

#else

static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
{
	return false;
}

#endif

#endif
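/*
 * Illustrative sketch (not part of the original header): a hypothetical
 * helper using is_vm_hugetlb_page() to test whether an address lies in a
 * hugetlb mapping. It assumes the caller already holds the mm's mmap lock,
 * as find_vma() requires.
 */
#include <linux/mm.h>
#include <linux/hugetlb_inline.h>

static bool example_addr_in_hugetlb_vma(struct mm_struct *mm,
					unsigned long addr)
{
	struct vm_area_struct *vma = find_vma(mm, addr);

	/* find_vma() returns the first VMA ending above addr; check the start too. */
	return vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma);
}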
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SCHED_SIGNAL_H
#define _LINUX_SCHED_SIGNAL_H

#include <linux/rculist.h>
#include <linux/signal.h>
#include <linux/sched.h>
#include <linux/sched/jobctl.h>
#include <linux/sched/task.h>
#include <linux/cred.h>
#include <linux/refcount.h>
#include <linux/posix-timers.h>
#include <linux/mm_types.h>
#include <asm/ptrace.h>

/*
 * Types defining task->signal and task->sighand and APIs using them:
 */

struct sighand_struct {
	spinlock_t		siglock;
	refcount_t		count;
	wait_queue_head_t	signalfd_wqh;
	struct k_sigaction	action[_NSIG];
};

/*
 * Per-process accounting stats:
 */
struct pacct_struct {
	int			ac_flag;
	long			ac_exitcode;
	unsigned long		ac_mem;
	u64			ac_utime, ac_stime;
	unsigned long		ac_minflt, ac_majflt;
};
struct cpu_itimer { u64 expires; u64 incr; }; /* * This is the atomic variant of task_cputime, which can be used for * storing and updating task_cputime statistics without locking. */ struct task_cputime_atomic { atomic64_t utime; atomic64_t stime; atomic64_t sum_exec_runtime; }; #define INIT_CPUTIME_ATOMIC \ (struct task_cputime_atomic) { \ .utime = ATOMIC64_INIT(0), \ .stime = ATOMIC64_INIT(0), \ .sum_exec_runtime = ATOMIC64_INIT(0), \ } /** * struct thread_group_cputimer - thread group interval timer counts * @cputime_atomic: atomic thread group interval timers. * * This structure contains the version of task_cputime, above, that is * used for thread group CPU timer calculations. */ struct thread_group_cputimer { struct task_cputime_atomic cputime_atomic; }; struct multiprocess_signals { sigset_t signal; struct hlist_node node; }; /* * NOTE! "signal_struct" does not have its own * locking, because a shared signal_struct always * implies a shared sighand_struct, so locking * sighand_struct is always a proper superset of * the locking of signal_struct. */ struct signal_struct { refcount_t sigcnt; atomic_t live; int nr_threads; struct list_head thread_head; wait_queue_head_t wait_chldexit; /* for wait4() */ /* current thread group signal load-balancing target: */ struct task_struct *curr_target; /* shared signal handling: */ struct sigpending shared_pending; /* For collecting multiprocess signals during fork */ struct hlist_head multiprocess; /* thread group exit support */ int group_exit_code; /* overloaded: * - notify group_exit_task when ->count is equal to notify_count * - everyone except group_exit_task is stopped during signal delivery * of fatal signals, group_exit_task processes the signal. */ int notify_count; struct task_struct *group_exit_task; /* thread group stop support, overloads group_exit_code too */ int group_stop_count; unsigned int flags; /* see SIGNAL_* flags below */ /* * PR_SET_CHILD_SUBREAPER marks a process, like a service * manager, to re-parent orphan (double-forking) child processes * to this process instead of 'init'. The service manager is * able to receive SIGCHLD signals and is able to investigate * the process until it calls wait(). All children of this * process will inherit a flag if they should look for a * child_subreaper process at exit. */ unsigned int is_child_subreaper:1; unsigned int has_child_subreaper:1; #ifdef CONFIG_POSIX_TIMERS /* POSIX.1b Interval Timers */ int posix_timer_id; struct list_head posix_timers; /* ITIMER_REAL timer for the process */ struct hrtimer real_timer; ktime_t it_real_incr; /* * ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use * CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these * values are defined to 0 and 1 respectively */ struct cpu_itimer it[2]; /* * Thread group totals for process CPU timers. * See thread_group_cputimer(), et al, for details. */ struct thread_group_cputimer cputimer; #endif /* Empty if CONFIG_POSIX_TIMERS=n */ struct posix_cputimers posix_cputimers; /* PID/PID hash table linkage. */ struct pid *pids[PIDTYPE_MAX]; #ifdef CONFIG_NO_HZ_FULL atomic_t tick_dep_mask; #endif struct pid *tty_old_pgrp; /* boolean value for session group leader */ int leader; struct tty_struct *tty; /* NULL if no tty */ #ifdef CONFIG_SCHED_AUTOGROUP struct autogroup *autogroup; #endif /* * Cumulative resource counters for dead threads in the group, * and for reaped dead child processes forked by this group. 
* Live threads maintain their own counters and add to these * in __exit_signal, except for the group leader. */ seqlock_t stats_lock; u64 utime, stime, cutime, cstime; u64 gtime; u64 cgtime; struct prev_cputime prev_cputime; unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; unsigned long inblock, oublock, cinblock, coublock; unsigned long maxrss, cmaxrss; struct task_io_accounting ioac; /* * Cumulative ns of schedule CPU time fo dead threads in the * group, not including a zombie group leader, (This only differs * from jiffies_to_ns(utime + stime) if sched_clock uses something * other than jiffies.) */ unsigned long long sum_sched_runtime; /* * We don't bother to synchronize most readers of this at all, * because there is no reader checking a limit that actually needs * to get both rlim_cur and rlim_max atomically, and either one * alone is a single word that can safely be read normally. * getrlimit/setrlimit use task_lock(current->group_leader) to * protect this instead of the siglock, because they really * have no need to disable irqs. */ struct rlimit rlim[RLIM_NLIMITS]; #ifdef CONFIG_BSD_PROCESS_ACCT struct pacct_struct pacct; /* per-process accounting information */ #endif #ifdef CONFIG_TASKSTATS struct taskstats *stats; #endif #ifdef CONFIG_AUDIT unsigned audit_tty; struct tty_audit_buf *tty_audit_buf; #endif /* * Thread is the potential origin of an oom condition; kill first on * oom */ bool oom_flag_origin; short oom_score_adj; /* OOM kill score adjustment */ short oom_score_adj_min; /* OOM kill score adjustment min value. * Only settable by CAP_SYS_RESOURCE. */ struct mm_struct *oom_mm; /* recorded mm when the thread group got * killed by the oom killer */ struct mutex cred_guard_mutex; /* guard against foreign influences on * credential calculations * (notably. ptrace) * Deprecated do not use in new code. * Use exec_update_lock instead. */ struct rw_semaphore exec_update_lock; /* Held while task_struct is * being updated during exec, * and may have inconsistent * permissions. */ } __randomize_layout; /* * Bits in flags field of signal_struct. */ #define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */ #define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */ #define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */ #define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */ /* * Pending notifications to parent. 
*/ #define SIGNAL_CLD_STOPPED 0x00000010 #define SIGNAL_CLD_CONTINUED 0x00000020 #define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED) #define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */ #define SIGNAL_STOP_MASK (SIGNAL_CLD_MASK | SIGNAL_STOP_STOPPED | \ SIGNAL_STOP_CONTINUED) static inline void signal_set_stop_flags(struct signal_struct *sig, unsigned int flags) { WARN_ON(sig->flags & (SIGNAL_GROUP_EXIT|SIGNAL_GROUP_COREDUMP)); sig->flags = (sig->flags & ~SIGNAL_STOP_MASK) | flags; } /* If true, all threads except ->group_exit_task have pending SIGKILL */ static inline int signal_group_exit(const struct signal_struct *sig) { return (sig->flags & SIGNAL_GROUP_EXIT) || (sig->group_exit_task != NULL); } extern void flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *task, sigset_t *mask, kernel_siginfo_t *info); static inline int kernel_dequeue_signal(void) { struct task_struct *task = current; kernel_siginfo_t __info; int ret; spin_lock_irq(&task->sighand->siglock); ret = dequeue_signal(task, &task->blocked, &__info); spin_unlock_irq(&task->sighand->siglock); return ret; } static inline void kernel_signal_stop(void) { spin_lock_irq(&current->sighand->siglock); if (current->jobctl & JOBCTL_STOP_DEQUEUED) set_special_state(TASK_STOPPED); spin_unlock_irq(&current->sighand->siglock); schedule(); } #ifdef __ARCH_SI_TRAPNO # define ___ARCH_SI_TRAPNO(_a1) , _a1 #else # define ___ARCH_SI_TRAPNO(_a1) #endif #ifdef __ia64__ # define ___ARCH_SI_IA64(_a1, _a2, _a3) , _a1, _a2, _a3 #else # define ___ARCH_SI_IA64(_a1, _a2, _a3) #endif int force_sig_fault_to_task(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); int force_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr)); int send_sig_fault(int sig, int code, void __user *addr ___ARCH_SI_TRAPNO(int trapno) ___ARCH_SI_IA64(int imm, unsigned int flags, unsigned long isr) , struct task_struct *t); int force_sig_mceerr(int code, void __user *, short); int send_sig_mceerr(int code, void __user *, short, struct task_struct *); int force_sig_bnderr(void __user *addr, void __user *lower, void __user *upper); int force_sig_pkuerr(void __user *addr, u32 pkey); int force_sig_ptrace_errno_trap(int errno, void __user *addr); extern int send_sig_info(int, struct kernel_siginfo *, struct task_struct *); extern void force_sigsegv(int sig); extern int force_sig_info(struct kernel_siginfo *); extern int __kill_pgrp_info(int sig, struct kernel_siginfo *info, struct pid *pgrp); extern int kill_pid_info(int sig, struct kernel_siginfo *info, struct pid *pid); extern int kill_pid_usb_asyncio(int sig, int errno, sigval_t addr, struct pid *, const struct cred *); extern int kill_pgrp(struct pid *pid, int sig, int priv); extern int kill_pid(struct pid *pid, int sig, int priv); extern __must_check bool do_notify_parent(struct task_struct *, int); extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent); extern void force_sig(int); extern int send_sig(int, struct task_struct *, int); extern int zap_other_threads(struct task_struct *p); extern struct sigqueue *sigqueue_alloc(void); extern void sigqueue_free(struct sigqueue *); extern int send_sigqueue(struct sigqueue *, struct 
pid *, enum pid_type); extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *); static inline int restart_syscall(void) { set_tsk_thread_flag(current, TIF_SIGPENDING); return -ERESTARTNOINTR; } static inline int signal_pending(struct task_struct *p) { return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING)); } static inline int __fatal_signal_pending(struct task_struct *p) { return unlikely(sigismember(&p->pending.signal, SIGKILL)); } static inline int fatal_signal_pending(struct task_struct *p) { return signal_pending(p) && __fatal_signal_pending(p); } static inline int signal_pending_state(long state, struct task_struct *p) { if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL))) return 0; if (!signal_pending(p)) return 0; return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p); } /* * This should only be used in fault handlers to decide whether we * should stop the current fault routine to handle the signals * instead, especially with the case where we've got interrupted with * a VM_FAULT_RETRY. */ static inline bool fault_signal_pending(vm_fault_t fault_flags, struct pt_regs *regs) { return unlikely((fault_flags & VM_FAULT_RETRY) && (fatal_signal_pending(current) || (user_mode(regs) && signal_pending(current)))); } /* * Reevaluate whether the task has signals pending delivery. * Wake the task if so. * This is required every time the blocked sigset_t changes. * callers must hold sighand->siglock. */ extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); extern void calculate_sigpending(void); extern void signal_wake_up_state(struct task_struct *t, unsigned int state); static inline void signal_wake_up(struct task_struct *t, bool resume) { signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); } static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) { signal_wake_up_state(t, resume ? __TASK_TRACED : 0); } void task_join_group_stop(struct task_struct *task); #ifdef TIF_RESTORE_SIGMASK /* * Legacy restore_sigmask accessors. These are inefficient on * SMP architectures because they require atomic operations. */ /** * set_restore_sigmask() - make sure saved_sigmask processing gets done * * This sets TIF_RESTORE_SIGMASK and ensures that the arch signal code * will run before returning to user mode, to process the flag. For * all callers, TIF_SIGPENDING is already set or it's no harm to set * it. TIF_RESTORE_SIGMASK need not be in the set of bits that the * arch code will notice on return to user mode, in case those bits * are scarce. We set TIF_SIGPENDING here to ensure that the arch * signal code always gets run when TIF_RESTORE_SIGMASK is set. */ static inline void set_restore_sigmask(void) { set_thread_flag(TIF_RESTORE_SIGMASK); } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { clear_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline void clear_restore_sigmask(void) { clear_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return test_tsk_thread_flag(task, TIF_RESTORE_SIGMASK); } static inline bool test_restore_sigmask(void) { return test_thread_flag(TIF_RESTORE_SIGMASK); } static inline bool test_and_clear_restore_sigmask(void) { return test_and_clear_thread_flag(TIF_RESTORE_SIGMASK); } #else /* TIF_RESTORE_SIGMASK */ /* Higher-quality implementation, used if TIF_RESTORE_SIGMASK doesn't exist. 
*/ static inline void set_restore_sigmask(void) { current->restore_sigmask = true; } static inline void clear_tsk_restore_sigmask(struct task_struct *task) { task->restore_sigmask = false; } static inline void clear_restore_sigmask(void) { current->restore_sigmask = false; } static inline bool test_restore_sigmask(void) { return current->restore_sigmask; } static inline bool test_tsk_restore_sigmask(struct task_struct *task) { return task->restore_sigmask; } static inline bool test_and_clear_restore_sigmask(void) { if (!current->restore_sigmask) return false; current->restore_sigmask = false; return true; } #endif static inline void restore_saved_sigmask(void) { if (test_and_clear_restore_sigmask()) __set_current_blocked(&current->saved_sigmask); } extern int set_user_sigmask(const sigset_t __user *umask, size_t sigsetsize); static inline void restore_saved_sigmask_unless(bool interrupted) { if (interrupted) WARN_ON(!test_thread_flag(TIF_SIGPENDING)); else restore_saved_sigmask(); } static inline sigset_t *sigmask_to_save(void) { sigset_t *res = &current->blocked; if (unlikely(test_restore_sigmask())) res = &current->saved_sigmask; return res; } static inline int kill_cad_pid(int sig, int priv) { return kill_pid(cad_pid, sig, priv); } /* These can be the second arg to send_sig_info/send_group_sig_info. */ #define SEND_SIG_NOINFO ((struct kernel_siginfo *) 0) #define SEND_SIG_PRIV ((struct kernel_siginfo *) 1) static inline int __on_sig_stack(unsigned long sp) { #ifdef CONFIG_STACK_GROWSUP return sp >= current->sas_ss_sp && sp - current->sas_ss_sp < current->sas_ss_size; #else return sp > current->sas_ss_sp && sp - current->sas_ss_sp <= current->sas_ss_size; #endif } /* * True if we are on the alternate signal stack. */ static inline int on_sig_stack(unsigned long sp) { /* * If the signal stack is SS_AUTODISARM then, by construction, we * can't be on the signal stack unless user code deliberately set * SS_AUTODISARM when we were already on it. * * This improves reliability: if user state gets corrupted such that * the stack pointer points very close to the end of the signal stack, * then this check will enable the signal to be handled anyway. */ if (current->sas_ss_flags & SS_AUTODISARM) return 0; return __on_sig_stack(sp); } static inline int sas_ss_flags(unsigned long sp) { if (!current->sas_ss_size) return SS_DISABLE; return on_sig_stack(sp) ? SS_ONSTACK : 0; } static inline void sas_ss_reset(struct task_struct *p) { p->sas_ss_sp = 0; p->sas_ss_size = 0; p->sas_ss_flags = SS_DISABLE; } static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig) { if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp)) #ifdef CONFIG_STACK_GROWSUP return current->sas_ss_sp; #else return current->sas_ss_sp + current->sas_ss_size; #endif return sp; } extern void __cleanup_sighand(struct sighand_struct *); extern void flush_itimer_signals(void); #define tasklist_empty() \ list_empty(&init_task.tasks) #define next_task(p) \ list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) extern bool current_is_single_threaded(void); /* * Careful: do_each_thread/while_each_thread is a double loop so * 'break' will not work as expected - use goto instead. 
*/ #define do_each_thread(g, t) \ for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do #define while_each_thread(g, t) \ while ((t = next_thread(t)) != g) #define __for_each_thread(signal, t) \ list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) /* Careful: this is a double loop, 'break' won't work as expected. */ #define for_each_process_thread(p, t) \ for_each_process(p) for_each_thread(p, t) typedef int (*proc_visitor)(struct task_struct *p, void *data); void walk_process_tree(struct task_struct *top, proc_visitor, void *); static inline struct pid *task_pid_type(struct task_struct *task, enum pid_type type) { struct pid *pid; if (type == PIDTYPE_PID) pid = task_pid(task); else pid = task->signal->pids[type]; return pid; } static inline struct pid *task_tgid(struct task_struct *task) { return task->signal->pids[PIDTYPE_TGID]; } /* * Without tasklist or RCU lock it is not safe to dereference * the result of task_pgrp/task_session even if task == current, * we can race with another thread doing sys_setsid/sys_setpgid. */ static inline struct pid *task_pgrp(struct task_struct *task) { return task->signal->pids[PIDTYPE_PGID]; } static inline struct pid *task_session(struct task_struct *task) { return task->signal->pids[PIDTYPE_SID]; } static inline int get_nr_threads(struct task_struct *task) { return task->signal->nr_threads; } static inline bool thread_group_leader(struct task_struct *p) { return p->exit_signal >= 0; } static inline bool same_thread_group(struct task_struct *p1, struct task_struct *p2) { return p1->signal == p2->signal; } static inline struct task_struct *next_thread(const struct task_struct *p) { return list_entry_rcu(p->thread_group.next, struct task_struct, thread_group); } static inline int thread_group_empty(struct task_struct *p) { return list_empty(&p->thread_group); } #define delay_group_leader(p) \ (thread_group_leader(p) && !thread_group_empty(p)) extern bool thread_group_exited(struct pid *pid); extern struct sighand_struct *__lock_task_sighand(struct task_struct *task, unsigned long *flags); static inline struct sighand_struct *lock_task_sighand(struct task_struct *task, unsigned long *flags) { struct sighand_struct *ret; ret = __lock_task_sighand(task, flags); (void)__cond_lock(&task->sighand->siglock, ret); return ret; } static inline void unlock_task_sighand(struct task_struct *task, unsigned long *flags) { spin_unlock_irqrestore(&task->sighand->siglock, *flags); } static inline unsigned long task_rlimit(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_cur); } static inline unsigned long task_rlimit_max(const struct task_struct *task, unsigned int limit) { return READ_ONCE(task->signal->rlim[limit].rlim_max); } static inline unsigned long rlimit(unsigned int limit) { return task_rlimit(current, limit); } static inline unsigned long rlimit_max(unsigned int limit) { return task_rlimit_max(current, limit); } #endif /* _LINUX_SCHED_SIGNAL_H */
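/*
 * Illustrative sketch (not part of the original header): typical use of the
 * helpers declared above. A long-running kernel loop can poll
 * fatal_signal_pending() to bail out once SIGKILL is queued, and
 * task_rlimit() reads the caller's soft limit without taking siglock. The
 * function and the RLIMIT_NOFILE choice are examples only.
 */
#include <linux/sched/signal.h>

static int example_bounded_work(unsigned long nr_items)
{
	unsigned long i, limit = task_rlimit(current, RLIMIT_NOFILE);

	for (i = 0; i < nr_items && i < limit; i++) {
		if (fatal_signal_pending(current))
			return -EINTR;
		cond_resched();
	}
	return 0;
}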
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 *	INET		An implementation of the TCP/IP protocol suite for the LINUX
 *			operating system.  INET is implemented using the BSD Socket
 *			interface as the means of communication with the user level.
 *
 *		Checksumming functions for IPv6
 *
 * Authors:	Jorge Cwik, <jorge@laser.satlink.net>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Borrows very liberally from tcp.c and ip.c, see those
 *		files for more names.
 */

/*
 *	Fixes:
 *
 *	Ralf Baechle			:	generic ipv6 checksum
 *	<ralf@waldorf-gmbh.de>
 */

#ifndef _CHECKSUM_IPV6_H
#define _CHECKSUM_IPV6_H

#include <asm/types.h>
#include <asm/byteorder.h>
#include <net/ip.h>
#include <asm/checksum.h>
#include <linux/in6.h>
#include <linux/tcp.h>
#include <linux/ipv6.h>

#ifndef _HAVE_ARCH_IPV6_CSUM
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
			const struct in6_addr *daddr,
			__u32 len, __u8 proto, __wsum csum);
#endif

static inline __wsum ip6_compute_pseudo(struct sk_buff *skb, int proto)
{
	return ~csum_unfold(csum_ipv6_magic(&ipv6_hdr(skb)->saddr,
					    &ipv6_hdr(skb)->daddr,
					    skb->len, proto, 0));
}

static inline __wsum ip6_gro_compute_pseudo(struct sk_buff *skb, int proto)
{
	const struct ipv6hdr *iph = skb_gro_network_header(skb);

	return ~csum_unfold(csum_ipv6_magic(&iph->saddr, &iph->daddr,
					    skb_gro_len(skb), proto, 0));
}

static __inline__ __sum16 tcp_v6_check(int len,
				       const struct in6_addr *saddr,
				       const struct in6_addr *daddr,
				       __wsum base)
{
	return csum_ipv6_magic(saddr, daddr, len, IPPROTO_TCP, base);
}

static inline void __tcp_v6_send_check(struct sk_buff *skb,
				       const struct in6_addr *saddr,
				       const struct in6_addr *daddr)
{
	struct tcphdr *th = tcp_hdr(skb);

	if (skb->ip_summed == CHECKSUM_PARTIAL) {
		th->check = ~tcp_v6_check(skb->len, saddr, daddr, 0);
		skb->csum_start = skb_transport_header(skb) - skb->head;
		skb->csum_offset = offsetof(struct tcphdr, check);
	} else {
		th->check = tcp_v6_check(skb->len, saddr, daddr,
					 csum_partial(th, th->doff << 2,
						      skb->csum));
	}
}

static inline void tcp_v6_gso_csum_prep(struct sk_buff *skb)
{
	struct ipv6hdr *ipv6h = ipv6_hdr(skb);
	struct tcphdr *th = tcp_hdr(skb);

	ipv6h->payload_len = 0;
	th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0);
}

static inline __sum16 udp_v6_check(int len,
				   const struct in6_addr *saddr,
				   const struct in6_addr *daddr,
				   __wsum base)
{
	return csum_ipv6_magic(saddr, daddr, len, IPPROTO_UDP, base);
}

void udp6_set_csum(bool nocheck, struct sk_buff *skb,
		   const struct in6_addr *saddr,
		   const struct in6_addr *daddr, int len);

int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto);

#endif
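/*
 * Illustrative sketch (not part of the original header): filling in a UDPv6
 * checksum by hand with udp_v6_check(), roughly what udp6_set_csum() does in
 * the software-checksum case. The helper name and the assumption that the
 * UDP header plus payload are linear in the skb are illustrative only.
 */
#include <linux/udp.h>
#include <net/ip6_checksum.h>

static inline void example_udp6_fill_check(struct sk_buff *skb,
					   const struct in6_addr *saddr,
					   const struct in6_addr *daddr,
					   int len)
{
	struct udphdr *uh = udp_hdr(skb);

	uh->check = 0;
	uh->check = udp_v6_check(len, saddr, daddr,
				 csum_partial(uh, len, 0));
	if (uh->check == 0)
		uh->check = CSUM_MANGLED_0;	/* all-zero means "no checksum" for UDP */
}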
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_HWEIGHT_H
#define _ASM_X86_HWEIGHT_H

#include <asm/cpufeatures.h>

#ifdef CONFIG_64BIT
#define REG_IN "D"
#define REG_OUT "a"
#else
#define REG_IN "a"
#define REG_OUT "a"
#endif

static __always_inline unsigned int __arch_hweight32(unsigned int w)
{
	unsigned int res;

	asm (ALTERNATIVE("call __sw_hweight32", "popcntl %1, %0", X86_FEATURE_POPCNT)
			 : "="REG_OUT (res)
			 : REG_IN (w));

	return res;
}

static inline unsigned int __arch_hweight16(unsigned int w)
{
	return __arch_hweight32(w & 0xffff);
}

static inline unsigned int __arch_hweight8(unsigned int w)
{
	return __arch_hweight32(w & 0xff);
}

#ifdef CONFIG_X86_32
static inline unsigned long __arch_hweight64(__u64 w)
{
	return __arch_hweight32((u32)w) +
	       __arch_hweight32((u32)(w >> 32));
}
#else
static __always_inline unsigned long __arch_hweight64(__u64 w)
{
	unsigned long res;

	asm (ALTERNATIVE("call __sw_hweight64", "popcntq %1, %0", X86_FEATURE_POPCNT)
			 : "="REG_OUT (res)
			 : REG_IN (w));

	return res;
}
#endif /* CONFIG_X86_32 */

#endif
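/*
 * Illustrative sketch (not part of the original header): kernel code normally
 * reaches these helpers through the generic hweight32()/hweight64() wrappers
 * from <linux/bitops.h> rather than calling __arch_hweight*() directly. The
 * function below is a hypothetical example: counting the set bits of a 64-bit
 * capability mask.
 */
#include <linux/bitops.h>

static inline unsigned int example_count_caps(u64 cap_mask)
{
	/* Compiles down to a single POPCNT on CPUs with X86_FEATURE_POPCNT. */
	return hweight64(cap_mask);
}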
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM net

#if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_NET_H

#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <linux/tracepoint.h>

TRACE_EVENT(net_dev_start_xmit,

	TP_PROTO(const struct sk_buff *skb, const struct net_device *dev),

	TP_ARGS(skb, dev),

	TP_STRUCT__entry(
		__string( name, dev->name )
		__field( u16, queue_mapping )
		__field( const void *, skbaddr )
		__field( bool, vlan_tagged )
		__field( u16, vlan_proto )
		__field( u16, vlan_tci )
		__field( u16, protocol )
		__field( u8, ip_summed )
		__field( unsigned int, len )
		__field( unsigned int, data_len )
		__field( int, network_offset )
		__field( bool, transport_offset_valid)
		__field( int, transport_offset)
		__field( u8, tx_flags )
		__field( u16, gso_size )
		__field( u16, gso_segs )
		__field( u16, gso_type )
	),

	TP_fast_assign(
		__assign_str(name, dev->name);
		__entry->queue_mapping = skb->queue_mapping;
		__entry->skbaddr = skb;
		__entry->vlan_tagged = skb_vlan_tag_present(skb);
		__entry->vlan_proto = ntohs(skb->vlan_proto);
		__entry->vlan_tci = skb_vlan_tag_get(skb);
		__entry->protocol = ntohs(skb->protocol);
		__entry->ip_summed = skb->ip_summed;
		__entry->len = skb->len;
		__entry->data_len = skb->data_len;
		__entry->network_offset = skb_network_offset(skb);
		__entry->transport_offset_valid =
			skb_transport_header_was_set(skb);
		__entry->transport_offset = skb_transport_offset(skb);
		__entry->tx_flags = skb_shinfo(skb)->tx_flags;
		__entry->gso_size = skb_shinfo(skb)->gso_size;
		__entry->gso_segs = skb_shinfo(skb)->gso_segs;
		__entry->gso_type = skb_shinfo(skb)->gso_type;
	),

	TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
		  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
		  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
		  __entry->protocol, __entry->ip_summed, __entry->len,
		  __entry->data_len,
		  __entry->network_offset, __entry->transport_offset_valid,
		  __entry->transport_offset, __entry->tx_flags,
		  __entry->gso_size, __entry->gso_segs, __entry->gso_type)
);

TRACE_EVENT(net_dev_xmit,
TP_PROTO(struct sk_buff *skb, int rc, struct net_device *dev, unsigned int skb_len), TP_ARGS(skb, rc, dev, skb_len), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; __assign_str(name, dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); TRACE_EVENT(net_dev_xmit_timeout, TP_PROTO(struct net_device *dev, int queue_index), TP_ARGS(dev, queue_index), TP_STRUCT__entry( __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) ), TP_fast_assign( __assign_str(name, dev->name); __assign_str(driver, netdev_drivername(dev)); __entry->queue_index = queue_index; ), TP_printk("dev=%s driver=%s queue=%d", __get_str(name), __get_str(driver), __entry->queue_index) ); DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; __assign_str(name, skb->dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) DEFINE_EVENT(net_dev_template, net_dev_queue, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_receive_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_rx, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __string( name, skb->dev->name ) __field( unsigned int, napi_id ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( u32, hash ) __field( bool, l4_hash ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( unsigned int, truesize ) __field( bool, mac_header_valid) __field( int, mac_header ) __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name, skb->dev->name); #ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else __entry->napi_id = 0; #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; __entry->l4_hash = skb->l4_hash; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->truesize = skb->truesize; __entry->mac_header_valid = skb_mac_header_was_set(skb); __entry->mac_header = skb_mac_header(skb) - skb->data; __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, 
__entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, __entry->mac_header_valid, __entry->mac_header, __entry->nr_frags, __entry->gso_size, __entry->gso_type) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field(int, ret) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), TP_ARGS(ret) ); #endif /* _TRACE_NET_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
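/*
 * Illustrative sketch (not part of the original header): the events above
 * expand into trace_net_dev_start_xmit(), trace_net_dev_xmit() and friends.
 * The hypothetical helper below only shows how the two transmit tracepoints
 * bracket a driver transmit; the real call sites live in the core networking
 * transmit path.
 */
#include <trace/events/net.h>

static void example_trace_xmit(struct sk_buff *skb, struct net_device *dev,
			       int rc)
{
	unsigned int len = skb->len;	/* capture before the driver consumes the skb */

	trace_net_dev_start_xmit(skb, dev);
	/* ... the driver's transmit routine would run here and produce rc ... */
	trace_net_dev_xmit(skb, rc, dev, len);
}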
/* SPDX-License-Identifier: GPL-2.0+ */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM rseq

#if !defined(_TRACE_RSEQ_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_RSEQ_H

#include <linux/tracepoint.h>
#include <linux/types.h>

TRACE_EVENT(rseq_update,

	TP_PROTO(struct task_struct *t),

	TP_ARGS(t),

	TP_STRUCT__entry(
		__field(s32, cpu_id)
	),

	TP_fast_assign(
		__entry->cpu_id = raw_smp_processor_id();
	),

	TP_printk("cpu_id=%d", __entry->cpu_id)
);

TRACE_EVENT(rseq_ip_fixup,

	TP_PROTO(unsigned long regs_ip, unsigned long start_ip,
		 unsigned long post_commit_offset, unsigned long abort_ip),

	TP_ARGS(regs_ip, start_ip, post_commit_offset, abort_ip),

	TP_STRUCT__entry(
		__field(unsigned long, regs_ip)
		__field(unsigned long, start_ip)
		__field(unsigned long, post_commit_offset)
		__field(unsigned long, abort_ip)
	),

	TP_fast_assign(
		__entry->regs_ip = regs_ip;
		__entry->start_ip = start_ip;
		__entry->post_commit_offset = post_commit_offset;
		__entry->abort_ip = abort_ip;
	),

	TP_printk("regs_ip=0x%lx start_ip=0x%lx post_commit_offset=%lu abort_ip=0x%lx",
		  __entry->regs_ip, __entry->start_ip,
		  __entry->post_commit_offset, __entry->abort_ip)
);

#endif /* _TRACE_RSEQ_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
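/*
 * Illustrative sketch (not part of the original header): the tracepoints
 * above generate trace_rseq_update() and trace_rseq_ip_fixup(). The rseq
 * core would emit them along the lines of the hypothetical helper below,
 * e.g. when a critical section is aborted and the IP is redirected.
 */
#include <trace/events/rseq.h>

static void example_report_ip_fixup(struct task_struct *t,
				    unsigned long regs_ip,
				    unsigned long start_ip,
				    unsigned long post_commit_offset,
				    unsigned long abort_ip)
{
	trace_rseq_update(t);
	trace_rseq_ip_fixup(regs_ip, start_ip, post_commit_offset, abort_ip);
}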
// SPDX-License-Identifier: GPL-2.0-only /* * Implementation of the security services. * * Authors : Stephen Smalley, <sds@tycho.nsa.gov> * James Morris <jmorris@redhat.com> * * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * * Support for enhanced MLS infrastructure. * Support for context based audit filters. * * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com> * * Added conditional policy language extensions * * Updated: Hewlett-Packard <paul@paul-moore.com> * * Added support for NetLabel * Added support for the policy capability bitmap * * Updated: Chad Sellers <csellers@tresys.com> * * Added validation of kernel classes and permissions * * Updated: KaiGai Kohei <kaigai@ak.jp.nec.com> * * Added support for bounds domain and audit messages on masked permissions * * Updated: Guido Trentalancia <guido@trentalancia.com> * * Added support for runtime switching of the policy type * * Copyright (C) 2008, 2009 NEC Corporation * Copyright (C) 2006, 2007 Hewlett-Packard Development Company, L.P. * Copyright (C) 2004-2006 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004, 2006 Tresys Technology, LLC * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> */ #include <linux/kernel.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/spinlock.h> #include <linux/rcupdate.h> #include <linux/errno.h> #include <linux/in.h> #include <linux/sched.h> #include <linux/audit.h> #include <linux/vmalloc.h> #include <net/netlabel.h> #include "flask.h" #include "avc.h" #include "avc_ss.h" #include "security.h" #include "context.h" #include "policydb.h" #include "sidtab.h" #include "services.h" #include "conditional.h" #include "mls.h" #include "objsec.h" #include "netlabel.h" #include "xfrm.h" #include "ebitmap.h" #include "audit.h" #include "policycap_names.h" struct convert_context_args { struct selinux_state *state; struct policydb *oldp; struct policydb *newp; }; struct selinux_policy_convert_data { struct convert_context_args args; struct sidtab_convert_params sidtab_params; }; /* Forward declaration.
*/ static int context_struct_to_string(struct policydb *policydb, struct context *context, char **scontext, u32 *scontext_len); static int sidtab_entry_to_string(struct policydb *policydb, struct sidtab *sidtab, struct sidtab_entry *entry, char **scontext, u32 *scontext_len); static void context_struct_compute_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd, struct extended_perms *xperms); static int selinux_set_mapping(struct policydb *pol, struct security_class_mapping *map, struct selinux_map *out_map) { u16 i, j; unsigned k; bool print_unknown_handle = false; /* Find number of classes in the input mapping */ if (!map) return -EINVAL; i = 0; while (map[i].name) i++; /* Allocate space for the class records, plus one for class zero */ out_map->mapping = kcalloc(++i, sizeof(*out_map->mapping), GFP_ATOMIC); if (!out_map->mapping) return -ENOMEM; /* Store the raw class and permission values */ j = 0; while (map[j].name) { struct security_class_mapping *p_in = map + (j++); struct selinux_mapping *p_out = out_map->mapping + j; /* An empty class string skips ahead */ if (!strcmp(p_in->name, "")) { p_out->num_perms = 0; continue; } p_out->value = string_to_security_class(pol, p_in->name); if (!p_out->value) { pr_info("SELinux: Class %s not defined in policy.\n", p_in->name); if (pol->reject_unknown) goto err; p_out->num_perms = 0; print_unknown_handle = true; continue; } k = 0; while (p_in->perms[k]) { /* An empty permission string skips ahead */ if (!*p_in->perms[k]) { k++; continue; } p_out->perms[k] = string_to_av_perm(pol, p_out->value, p_in->perms[k]); if (!p_out->perms[k]) { pr_info("SELinux: Permission %s in class %s not defined in policy.\n", p_in->perms[k], p_in->name); if (pol->reject_unknown) goto err; print_unknown_handle = true; } k++; } p_out->num_perms = k; } if (print_unknown_handle) pr_info("SELinux: the above unknown classes and permissions will be %s\n", pol->allow_unknown ? 
"allowed" : "denied"); out_map->size = i; return 0; err: kfree(out_map->mapping); out_map->mapping = NULL; return -EINVAL; } /* * Get real, policy values from mapped values */ static u16 unmap_class(struct selinux_map *map, u16 tclass) { if (tclass < map->size) return map->mapping[tclass].value; return tclass; } /* * Get kernel value for class from its policy value */ static u16 map_class(struct selinux_map *map, u16 pol_value) { u16 i; for (i = 1; i < map->size; i++) { if (map->mapping[i].value == pol_value) return i; } return SECCLASS_NULL; } static void map_decision(struct selinux_map *map, u16 tclass, struct av_decision *avd, int allow_unknown) { if (tclass < map->size) { struct selinux_mapping *mapping = &map->mapping[tclass]; unsigned int i, n = mapping->num_perms; u32 result; for (i = 0, result = 0; i < n; i++) { if (avd->allowed & mapping->perms[i]) result |= 1<<i; if (allow_unknown && !mapping->perms[i]) result |= 1<<i; } avd->allowed = result; for (i = 0, result = 0; i < n; i++) if (avd->auditallow & mapping->perms[i]) result |= 1<<i; avd->auditallow = result; for (i = 0, result = 0; i < n; i++) { if (avd->auditdeny & mapping->perms[i]) result |= 1<<i; if (!allow_unknown && !mapping->perms[i]) result |= 1<<i; } /* * In case the kernel has a bug and requests a permission * between num_perms and the maximum permission number, we * should audit that denial */ for (; i < (sizeof(u32)*8); i++) result |= 1<<i; avd->auditdeny = result; } } int security_mls_enabled(struct selinux_state *state) { int mls_enabled; struct selinux_policy *policy; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); mls_enabled = policy->policydb.mls_enabled; rcu_read_unlock(); return mls_enabled; } /* * Return the boolean value of a constraint expression * when it is applied to the specified source and target * security contexts. * * xcontext is a special beast... It is used by the validatetrans rules * only. For these rules, scontext is the context before the transition, * tcontext is the context after the transition, and xcontext is the context * of the process performing the transition. All other callers of * constraint_expr_eval should pass in NULL for xcontext. 
*/ static int constraint_expr_eval(struct policydb *policydb, struct context *scontext, struct context *tcontext, struct context *xcontext, struct constraint_expr *cexpr) { u32 val1, val2; struct context *c; struct role_datum *r1, *r2; struct mls_level *l1, *l2; struct constraint_expr *e; int s[CEXPR_MAXDEPTH]; int sp = -1; for (e = cexpr; e; e = e->next) { switch (e->expr_type) { case CEXPR_NOT: BUG_ON(sp < 0); s[sp] = !s[sp]; break; case CEXPR_AND: BUG_ON(sp < 1); sp--; s[sp] &= s[sp + 1]; break; case CEXPR_OR: BUG_ON(sp < 1); sp--; s[sp] |= s[sp + 1]; break; case CEXPR_ATTR: if (sp == (CEXPR_MAXDEPTH - 1)) return 0; switch (e->attr) { case CEXPR_USER: val1 = scontext->user; val2 = tcontext->user; break; case CEXPR_TYPE: val1 = scontext->type; val2 = tcontext->type; break; case CEXPR_ROLE: val1 = scontext->role; val2 = tcontext->role; r1 = policydb->role_val_to_struct[val1 - 1]; r2 = policydb->role_val_to_struct[val2 - 1]; switch (e->op) { case CEXPR_DOM: s[++sp] = ebitmap_get_bit(&r1->dominates, val2 - 1); continue; case CEXPR_DOMBY: s[++sp] = ebitmap_get_bit(&r2->dominates, val1 - 1); continue; case CEXPR_INCOMP: s[++sp] = (!ebitmap_get_bit(&r1->dominates, val2 - 1) && !ebitmap_get_bit(&r2->dominates, val1 - 1)); continue; default: break; } break; case CEXPR_L1L2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_L1H2: l1 = &(scontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_H1L2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[0]); goto mls_ops; case CEXPR_H1H2: l1 = &(scontext->range.level[1]); l2 = &(tcontext->range.level[1]); goto mls_ops; case CEXPR_L1H1: l1 = &(scontext->range.level[0]); l2 = &(scontext->range.level[1]); goto mls_ops; case CEXPR_L2H2: l1 = &(tcontext->range.level[0]); l2 = &(tcontext->range.level[1]); goto mls_ops; mls_ops: switch (e->op) { case CEXPR_EQ: s[++sp] = mls_level_eq(l1, l2); continue; case CEXPR_NEQ: s[++sp] = !mls_level_eq(l1, l2); continue; case CEXPR_DOM: s[++sp] = mls_level_dom(l1, l2); continue; case CEXPR_DOMBY: s[++sp] = mls_level_dom(l2, l1); continue; case CEXPR_INCOMP: s[++sp] = mls_level_incomp(l2, l1); continue; default: BUG(); return 0; } break; default: BUG(); return 0; } switch (e->op) { case CEXPR_EQ: s[++sp] = (val1 == val2); break; case CEXPR_NEQ: s[++sp] = (val1 != val2); break; default: BUG(); return 0; } break; case CEXPR_NAMES: if (sp == (CEXPR_MAXDEPTH-1)) return 0; c = scontext; if (e->attr & CEXPR_TARGET) c = tcontext; else if (e->attr & CEXPR_XTARGET) { c = xcontext; if (!c) { BUG(); return 0; } } if (e->attr & CEXPR_USER) val1 = c->user; else if (e->attr & CEXPR_ROLE) val1 = c->role; else if (e->attr & CEXPR_TYPE) val1 = c->type; else { BUG(); return 0; } switch (e->op) { case CEXPR_EQ: s[++sp] = ebitmap_get_bit(&e->names, val1 - 1); break; case CEXPR_NEQ: s[++sp] = !ebitmap_get_bit(&e->names, val1 - 1); break; default: BUG(); return 0; } break; default: BUG(); return 0; } } BUG_ON(sp != 0); return s[0]; } /* * security_dump_masked_av - dumps masked permissions during * security_compute_av due to RBAC, MLS/Constraint and Type bounds. 
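 *
 * The emitted audit record looks roughly like (field values below are
 * illustrative only):
 *
 *	op=security_compute_av reason=bounds scontext=... tcontext=...
 *	tclass=file perms=read,write
 *
 * with "????" printed for any masked bit that has no name in the
 * class's permission table.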
*/ static int dump_masked_av_helper(void *k, void *d, void *args) { struct perm_datum *pdatum = d; char **permission_names = args; BUG_ON(pdatum->value < 1 || pdatum->value > 32); permission_names[pdatum->value - 1] = (char *)k; return 0; } static void security_dump_masked_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, u32 permissions, const char *reason) { struct common_datum *common_dat; struct class_datum *tclass_dat; struct audit_buffer *ab; char *tclass_name; char *scontext_name = NULL; char *tcontext_name = NULL; char *permission_names[32]; int index; u32 length; bool need_comma = false; if (!permissions) return; tclass_name = sym_name(policydb, SYM_CLASSES, tclass - 1); tclass_dat = policydb->class_val_to_struct[tclass - 1]; common_dat = tclass_dat->comdatum; /* init permission_names */ if (common_dat && hashtab_map(&common_dat->permissions.table, dump_masked_av_helper, permission_names) < 0) goto out; if (hashtab_map(&tclass_dat->permissions.table, dump_masked_av_helper, permission_names) < 0) goto out; /* get scontext/tcontext in text form */ if (context_struct_to_string(policydb, scontext, &scontext_name, &length) < 0) goto out; if (context_struct_to_string(policydb, tcontext, &tcontext_name, &length) < 0) goto out; /* audit a message */ ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); if (!ab) goto out; audit_log_format(ab, "op=security_compute_av reason=%s " "scontext=%s tcontext=%s tclass=%s perms=", reason, scontext_name, tcontext_name, tclass_name); for (index = 0; index < 32; index++) { u32 mask = (1 << index); if ((mask & permissions) == 0) continue; audit_log_format(ab, "%s%s", need_comma ? "," : "", permission_names[index] ? permission_names[index] : "????"); need_comma = true; } audit_log_end(ab); out: /* release scontext/tcontext */ kfree(tcontext_name); kfree(scontext_name); return; } /* * security_boundary_permission - drops violated permissions * on boundary constraint. 
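 *
 * A bounded type can never be granted more than its bounding type.
 * As an illustration (the type names are made up), given
 *
 *	typebounds httpd_t httpd_child_t;
 *
 * any permission allowed to httpd_child_t but not to httpd_t is masked
 * out of the computed access vector below and reported through
 * security_dump_masked_av() with reason "bounds".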
*/ static void type_attribute_bounds_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd) { struct context lo_scontext; struct context lo_tcontext, *tcontextp = tcontext; struct av_decision lo_avd; struct type_datum *source; struct type_datum *target; u32 masked = 0; source = policydb->type_val_to_struct[scontext->type - 1]; BUG_ON(!source); if (!source->bounds) return; target = policydb->type_val_to_struct[tcontext->type - 1]; BUG_ON(!target); memset(&lo_avd, 0, sizeof(lo_avd)); memcpy(&lo_scontext, scontext, sizeof(lo_scontext)); lo_scontext.type = source->bounds; if (target->bounds) { memcpy(&lo_tcontext, tcontext, sizeof(lo_tcontext)); lo_tcontext.type = target->bounds; tcontextp = &lo_tcontext; } context_struct_compute_av(policydb, &lo_scontext, tcontextp, tclass, &lo_avd, NULL); masked = ~lo_avd.allowed & avd->allowed; if (likely(!masked)) return; /* no masked permission */ /* mask violated permissions */ avd->allowed &= ~masked; /* audit masked permissions */ security_dump_masked_av(policydb, scontext, tcontext, tclass, masked, "bounds"); } /* * flag which drivers have permissions * only looking for ioctl based extended permssions */ void services_compute_xperms_drivers( struct extended_perms *xperms, struct avtab_node *node) { unsigned int i; if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { /* if one or more driver has all permissions allowed */ for (i = 0; i < ARRAY_SIZE(xperms->drivers.p); i++) xperms->drivers.p[i] |= node->datum.u.xperms->perms.p[i]; } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { /* if allowing permissions within a driver */ security_xperm_set(xperms->drivers.p, node->datum.u.xperms->driver); } /* If no ioctl commands are allowed, ignore auditallow and auditdeny */ if (node->key.specified & AVTAB_XPERMS_ALLOWED) xperms->len = 1; } /* * Compute access vectors and extended permissions based on a context * structure pair for the permissions in a particular class. */ static void context_struct_compute_av(struct policydb *policydb, struct context *scontext, struct context *tcontext, u16 tclass, struct av_decision *avd, struct extended_perms *xperms) { struct constraint_node *constraint; struct role_allow *ra; struct avtab_key avkey; struct avtab_node *node; struct class_datum *tclass_datum; struct ebitmap *sattr, *tattr; struct ebitmap_node *snode, *tnode; unsigned int i, j; avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (xperms) { memset(&xperms->drivers, 0, sizeof(xperms->drivers)); xperms->len = 0; } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { if (printk_ratelimit()) pr_warn("SELinux: Invalid class %hu\n", tclass); return; } tclass_datum = policydb->class_val_to_struct[tclass - 1]; /* * If a specific type enforcement rule was defined for * this permission check, then use it. 
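 *
 * Note that the lookup below runs over every (source, target) pair in
 * type_attr_map, i.e. each type plus every attribute it belongs to,
 * because a rule written against an attribute must apply to all of the
 * attribute's member types.  allowed and auditallow bits are OR-ed
 * together, auditdeny bits are AND-ed, and rules guarded by booleans
 * are folded in via cond_compute_av().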
*/ avkey.target_class = tclass; avkey.specified = AVTAB_AV | AVTAB_XPERMS; sattr = &policydb->type_attr_map_array[scontext->type - 1]; tattr = &policydb->type_attr_map_array[tcontext->type - 1]; ebitmap_for_each_positive_bit(sattr, snode, i) { ebitmap_for_each_positive_bit(tattr, tnode, j) { avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb->te_avtab, &avkey); node; node = avtab_search_node_next(node, avkey.specified)) { if (node->key.specified == AVTAB_ALLOWED) avd->allowed |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITALLOW) avd->auditallow |= node->datum.u.data; else if (node->key.specified == AVTAB_AUDITDENY) avd->auditdeny &= node->datum.u.data; else if (xperms && (node->key.specified & AVTAB_XPERMS)) services_compute_xperms_drivers(xperms, node); } /* Check conditional av table for additional permissions */ cond_compute_av(&policydb->te_cond_avtab, &avkey, avd, xperms); } } /* * Remove any permissions prohibited by a constraint (this includes * the MLS policy). */ constraint = tclass_datum->constraints; while (constraint) { if ((constraint->permissions & (avd->allowed)) && !constraint_expr_eval(policydb, scontext, tcontext, NULL, constraint->expr)) { avd->allowed &= ~(constraint->permissions); } constraint = constraint->next; } /* * If checking process transition permission and the * role is changing, then check the (current_role, new_role) * pair. */ if (tclass == policydb->process_class && (avd->allowed & policydb->process_trans_perms) && scontext->role != tcontext->role) { for (ra = policydb->role_allow; ra; ra = ra->next) { if (scontext->role == ra->role && tcontext->role == ra->new_role) break; } if (!ra) avd->allowed &= ~policydb->process_trans_perms; } /* * If the given source and target types have boundary * constraint, lazy checks have to mask any violated * permission and notice it to userspace via audit. 
*/ type_attribute_bounds_av(policydb, scontext, tcontext, tclass, avd); } static int security_validtrans_handle_fail(struct selinux_state *state, struct selinux_policy *policy, struct sidtab_entry *oentry, struct sidtab_entry *nentry, struct sidtab_entry *tentry, u16 tclass) { struct policydb *p = &policy->policydb; struct sidtab *sidtab = policy->sidtab; char *o = NULL, *n = NULL, *t = NULL; u32 olen, nlen, tlen; if (sidtab_entry_to_string(p, sidtab, oentry, &o, &olen)) goto out; if (sidtab_entry_to_string(p, sidtab, nentry, &n, &nlen)) goto out; if (sidtab_entry_to_string(p, sidtab, tentry, &t, &tlen)) goto out; audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR, "op=security_validate_transition seresult=denied" " oldcontext=%s newcontext=%s taskcontext=%s tclass=%s", o, n, t, sym_name(p, SYM_CLASSES, tclass-1)); out: kfree(o); kfree(n); kfree(t); if (!enforcing_enabled(state)) return 0; return -EPERM; } static int security_compute_validatetrans(struct selinux_state *state, u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass, bool user) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *oentry; struct sidtab_entry *nentry; struct sidtab_entry *tentry; struct class_datum *tclass_datum; struct constraint_node *constraint; u16 tclass; int rc = 0; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (!user) tclass = unmap_class(&policy->map, orig_tclass); else tclass = orig_tclass; if (!tclass || tclass > policydb->p_classes.nprim) { rc = -EINVAL; goto out; } tclass_datum = policydb->class_val_to_struct[tclass - 1]; oentry = sidtab_search_entry(sidtab, oldsid); if (!oentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, oldsid); rc = -EINVAL; goto out; } nentry = sidtab_search_entry(sidtab, newsid); if (!nentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, newsid); rc = -EINVAL; goto out; } tentry = sidtab_search_entry(sidtab, tasksid); if (!tentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tasksid); rc = -EINVAL; goto out; } constraint = tclass_datum->validatetrans; while (constraint) { if (!constraint_expr_eval(policydb, &oentry->context, &nentry->context, &tentry->context, constraint->expr)) { if (user) rc = -EPERM; else rc = security_validtrans_handle_fail(state, policy, oentry, nentry, tentry, tclass); goto out; } constraint = constraint->next; } out: rcu_read_unlock(); return rc; } int security_validate_transition_user(struct selinux_state *state, u32 oldsid, u32 newsid, u32 tasksid, u16 tclass) { return security_compute_validatetrans(state, oldsid, newsid, tasksid, tclass, true); } int security_validate_transition(struct selinux_state *state, u32 oldsid, u32 newsid, u32 tasksid, u16 orig_tclass) { return security_compute_validatetrans(state, oldsid, newsid, tasksid, orig_tclass, false); } /* * security_bounded_transition - check whether the given * transition is directed to bounded, or not. * It returns 0, if @newsid is bounded by @oldsid. * Otherwise, it returns error code. 
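 *
 * "Bounded" means the type of @newsid lies below the type of @oldsid
 * in the typebounds hierarchy: the loop below follows type->bounds
 * upwards from the new type until it either reaches the old type
 * (transition permitted) or runs out of parents (denied and audited).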
* * @oldsid : current security identifier * @newsid : destinated security identifier */ int security_bounded_transition(struct selinux_state *state, u32 old_sid, u32 new_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *old_entry, *new_entry; struct type_datum *type; int index; int rc; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; rc = -EINVAL; old_entry = sidtab_search_entry(sidtab, old_sid); if (!old_entry) { pr_err("SELinux: %s: unrecognized SID %u\n", __func__, old_sid); goto out; } rc = -EINVAL; new_entry = sidtab_search_entry(sidtab, new_sid); if (!new_entry) { pr_err("SELinux: %s: unrecognized SID %u\n", __func__, new_sid); goto out; } rc = 0; /* type/domain unchanged */ if (old_entry->context.type == new_entry->context.type) goto out; index = new_entry->context.type; while (true) { type = policydb->type_val_to_struct[index - 1]; BUG_ON(!type); /* not bounded anymore */ rc = -EPERM; if (!type->bounds) break; /* @newsid is bounded by @oldsid */ rc = 0; if (type->bounds == old_entry->context.type) break; index = type->bounds; } if (rc) { char *old_name = NULL; char *new_name = NULL; u32 length; if (!sidtab_entry_to_string(policydb, sidtab, old_entry, &old_name, &length) && !sidtab_entry_to_string(policydb, sidtab, new_entry, &new_name, &length)) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR, "op=security_bounded_transition " "seresult=denied " "oldcontext=%s newcontext=%s", old_name, new_name); } kfree(new_name); kfree(old_name); } out: rcu_read_unlock(); return rc; } static void avd_init(struct selinux_policy *policy, struct av_decision *avd) { avd->allowed = 0; avd->auditallow = 0; avd->auditdeny = 0xffffffff; if (policy) avd->seqno = policy->latest_granting; else avd->seqno = 0; avd->flags = 0; } void services_compute_xperms_decision(struct extended_perms_decision *xpermd, struct avtab_node *node) { unsigned int i; if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { if (xpermd->driver != node->datum.u.xperms->driver) return; } else if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { if (!security_xperm_test(node->datum.u.xperms->perms.p, xpermd->driver)) return; } else { BUG(); } if (node->key.specified == AVTAB_XPERMS_ALLOWED) { xpermd->used |= XPERMS_ALLOWED; if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p)); } if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { for (i = 0; i < ARRAY_SIZE(xpermd->allowed->p); i++) xpermd->allowed->p[i] |= node->datum.u.xperms->perms.p[i]; } } else if (node->key.specified == AVTAB_XPERMS_AUDITALLOW) { xpermd->used |= XPERMS_AUDITALLOW; if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { memset(xpermd->auditallow->p, 0xff, sizeof(xpermd->auditallow->p)); } if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { for (i = 0; i < ARRAY_SIZE(xpermd->auditallow->p); i++) xpermd->auditallow->p[i] |= node->datum.u.xperms->perms.p[i]; } } else if (node->key.specified == AVTAB_XPERMS_DONTAUDIT) { xpermd->used |= XPERMS_DONTAUDIT; if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLDRIVER) { memset(xpermd->dontaudit->p, 0xff, sizeof(xpermd->dontaudit->p)); } if (node->datum.u.xperms->specified == AVTAB_XPERMS_IOCTLFUNCTION) { for (i = 0; i < ARRAY_SIZE(xpermd->dontaudit->p); i++) xpermd->dontaudit->p[i] |= 
node->datum.u.xperms->perms.p[i]; } } else { BUG(); } } void security_compute_xperms_decision(struct selinux_state *state, u32 ssid, u32 tsid, u16 orig_tclass, u8 driver, struct extended_perms_decision *xpermd) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; struct context *scontext, *tcontext; struct avtab_key avkey; struct avtab_node *node; struct ebitmap *sattr, *tattr; struct ebitmap_node *snode, *tnode; unsigned int i, j; xpermd->driver = driver; xpermd->used = 0; memset(xpermd->allowed->p, 0, sizeof(xpermd->allowed->p)); memset(xpermd->auditallow->p, 0, sizeof(xpermd->auditallow->p)); memset(xpermd->dontaudit->p, 0, sizeof(xpermd->dontaudit->p)); rcu_read_lock(); if (!selinux_initialized(state)) goto allow; policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; goto out; } if (unlikely(!tclass || tclass > policydb->p_classes.nprim)) { pr_warn_ratelimited("SELinux: Invalid class %hu\n", tclass); goto out; } avkey.target_class = tclass; avkey.specified = AVTAB_XPERMS; sattr = &policydb->type_attr_map_array[scontext->type - 1]; tattr = &policydb->type_attr_map_array[tcontext->type - 1]; ebitmap_for_each_positive_bit(sattr, snode, i) { ebitmap_for_each_positive_bit(tattr, tnode, j) { avkey.source_type = i + 1; avkey.target_type = j + 1; for (node = avtab_search_node(&policydb->te_avtab, &avkey); node; node = avtab_search_node_next(node, avkey.specified)) services_compute_xperms_decision(xpermd, node); cond_compute_xperms(&policydb->te_cond_avtab, &avkey, xpermd); } } out: rcu_read_unlock(); return; allow: memset(xpermd->allowed->p, 0xff, sizeof(xpermd->allowed->p)); goto out; } /** * security_compute_av - Compute access vector decisions. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @avd: access vector decisions * @xperms: extended permissions * * Compute a set of access vector decisions based on the * SID pair (@ssid, @tsid) for the permissions in @tclass. */ void security_compute_av(struct selinux_state *state, u32 ssid, u32 tsid, u16 orig_tclass, struct av_decision *avd, struct extended_perms *xperms) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; u16 tclass; struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); policy = rcu_dereference(state->policy); avd_init(policy, avd); xperms->len = 0; if (!selinux_initialized(state)) goto allow; policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } /* permissive domain? 
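 * If the source domain was declared permissive in the policy, e.g.
 * with a statement such as "permissive httpd_t;" (type name
 * illustrative), flag the decision so the AVC audits but does not
 * enforce denials for this domain.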
*/ if (ebitmap_get_bit(&policydb->permissive_map, scontext->type)) avd->flags |= AVD_FLAGS_PERMISSIVE; tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } tclass = unmap_class(&policy->map, orig_tclass); if (unlikely(orig_tclass && !tclass)) { if (policydb->allow_unknown) goto allow; goto out; } context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, xperms); map_decision(&policy->map, orig_tclass, avd, policydb->allow_unknown); out: rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; goto out; } void security_compute_av_user(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, struct av_decision *avd) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *scontext = NULL, *tcontext = NULL; rcu_read_lock(); policy = rcu_dereference(state->policy); avd_init(policy, avd); if (!selinux_initialized(state)) goto allow; policydb = &policy->policydb; sidtab = policy->sidtab; scontext = sidtab_search(sidtab, ssid); if (!scontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); goto out; } /* permissive domain? */ if (ebitmap_get_bit(&policydb->permissive_map, scontext->type)) avd->flags |= AVD_FLAGS_PERMISSIVE; tcontext = sidtab_search(sidtab, tsid); if (!tcontext) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); goto out; } if (unlikely(!tclass)) { if (policydb->allow_unknown) goto allow; goto out; } context_struct_compute_av(policydb, scontext, tcontext, tclass, avd, NULL); out: rcu_read_unlock(); return; allow: avd->allowed = 0xffffffff; goto out; } /* * Write the security context string representation of * the context structure `context' into a dynamically * allocated string of the correct size. Set `*scontext' * to point to this string and set `*scontext_len' to * the length of the string. */ static int context_struct_to_string(struct policydb *p, struct context *context, char **scontext, u32 *scontext_len) { char *scontextp; if (scontext) *scontext = NULL; *scontext_len = 0; if (context->len) { *scontext_len = context->len; if (scontext) { *scontext = kstrdup(context->str, GFP_ATOMIC); if (!(*scontext)) return -ENOMEM; } return 0; } /* Compute the size of the context. */ *scontext_len += strlen(sym_name(p, SYM_USERS, context->user - 1)) + 1; *scontext_len += strlen(sym_name(p, SYM_ROLES, context->role - 1)) + 1; *scontext_len += strlen(sym_name(p, SYM_TYPES, context->type - 1)) + 1; *scontext_len += mls_compute_context_len(p, context); if (!scontext) return 0; /* Allocate space for the context; caller must free this space. */ scontextp = kmalloc(*scontext_len, GFP_ATOMIC); if (!scontextp) return -ENOMEM; *scontext = scontextp; /* * Copy the user name, role name and type name into the context. 
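 * The result uses the usual "user:role:type[:range]" layout, e.g.
 * "system_u:object_r:etc_t:s0" (illustrative values); the MLS portion
 * is appended by mls_sid_to_context() only when the loaded policy has
 * MLS enabled.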
*/ scontextp += sprintf(scontextp, "%s:%s:%s", sym_name(p, SYM_USERS, context->user - 1), sym_name(p, SYM_ROLES, context->role - 1), sym_name(p, SYM_TYPES, context->type - 1)); mls_sid_to_context(p, context, &scontextp); *scontextp = 0; return 0; } static int sidtab_entry_to_string(struct policydb *p, struct sidtab *sidtab, struct sidtab_entry *entry, char **scontext, u32 *scontext_len) { int rc = sidtab_sid2str_get(sidtab, entry, scontext, scontext_len); if (rc != -ENOENT) return rc; rc = context_struct_to_string(p, &entry->context, scontext, scontext_len); if (!rc && scontext) sidtab_sid2str_put(sidtab, entry, *scontext, *scontext_len); return rc; } #include "initial_sid_to_string.h" int security_sidtab_hash_stats(struct selinux_state *state, char *page) { struct selinux_policy *policy; int rc; if (!selinux_initialized(state)) { pr_err("SELinux: %s: called before initial load_policy\n", __func__); return -EINVAL; } rcu_read_lock(); policy = rcu_dereference(state->policy); rc = sidtab_hash_stats(policy->sidtab, page); rcu_read_unlock(); return rc; } const char *security_get_initial_sid_context(u32 sid) { if (unlikely(sid > SECINITSID_NUM)) return NULL; return initial_sid_to_string[sid]; } static int security_sid_to_context_core(struct selinux_state *state, u32 sid, char **scontext, u32 *scontext_len, int force, int only_invalid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct sidtab_entry *entry; int rc = 0; if (scontext) *scontext = NULL; *scontext_len = 0; if (!selinux_initialized(state)) { if (sid <= SECINITSID_NUM) { char *scontextp; const char *s = initial_sid_to_string[sid]; if (!s) return -EINVAL; *scontext_len = strlen(s) + 1; if (!scontext) return 0; scontextp = kmemdup(s, *scontext_len, GFP_ATOMIC); if (!scontextp) return -ENOMEM; *scontext = scontextp; return 0; } pr_err("SELinux: %s: called before initial " "load_policy on unknown SID %d\n", __func__, sid); return -EINVAL; } rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (force) entry = sidtab_search_entry_force(sidtab, sid); else entry = sidtab_search_entry(sidtab, sid); if (!entry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, sid); rc = -EINVAL; goto out_unlock; } if (only_invalid && !entry->context.len) goto out_unlock; rc = sidtab_entry_to_string(policydb, sidtab, entry, scontext, scontext_len); out_unlock: rcu_read_unlock(); return rc; } /** * security_sid_to_context - Obtain a context for a given SID. * @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes * * Write the string representation of the context associated with @sid * into a dynamically allocated string of the correct size. Set @scontext * to point to this string and set @scontext_len to the length of the string. */ int security_sid_to_context(struct selinux_state *state, u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(state, sid, scontext, scontext_len, 0, 0); } int security_sid_to_context_force(struct selinux_state *state, u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(state, sid, scontext, scontext_len, 1, 0); } /** * security_sid_to_context_inval - Obtain a context for a given SID if it * is invalid. 
* @sid: security identifier, SID * @scontext: security context * @scontext_len: length in bytes * * Write the string representation of the context associated with @sid * into a dynamically allocated string of the correct size, but only if the * context is invalid in the current policy. Set @scontext to point to * this string (or NULL if the context is valid) and set @scontext_len to * the length of the string (or 0 if the context is valid). */ int security_sid_to_context_inval(struct selinux_state *state, u32 sid, char **scontext, u32 *scontext_len) { return security_sid_to_context_core(state, sid, scontext, scontext_len, 1, 1); } /* * Caveat: Mutates scontext. */ static int string_to_context_struct(struct policydb *pol, struct sidtab *sidtabp, char *scontext, struct context *ctx, u32 def_sid) { struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *scontextp, *p, oldc; int rc = 0; context_init(ctx); /* Parse the security context. */ rc = -EINVAL; scontextp = (char *) scontext; /* Extract the user. */ p = scontextp; while (*p && *p != ':') p++; if (*p == 0) goto out; *p++ = 0; usrdatum = symtab_search(&pol->p_users, scontextp); if (!usrdatum) goto out; ctx->user = usrdatum->value; /* Extract role. */ scontextp = p; while (*p && *p != ':') p++; if (*p == 0) goto out; *p++ = 0; role = symtab_search(&pol->p_roles, scontextp); if (!role) goto out; ctx->role = role->value; /* Extract type. */ scontextp = p; while (*p && *p != ':') p++; oldc = *p; *p++ = 0; typdatum = symtab_search(&pol->p_types, scontextp); if (!typdatum || typdatum->attribute) goto out; ctx->type = typdatum->value; rc = mls_context_to_sid(pol, oldc, p, ctx, sidtabp, def_sid); if (rc) goto out; /* Check the validity of the new context. */ rc = -EINVAL; if (!policydb_context_isvalid(pol, ctx)) goto out; rc = 0; out: if (rc) context_destroy(ctx); return rc; } static int security_context_to_sid_core(struct selinux_state *state, const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags, int force) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; char *scontext2, *str = NULL; struct context context; int rc = 0; /* An empty security context is never valid. */ if (!scontext_len) return -EINVAL; /* Copy the string to allow changes and ensure a NUL terminator */ scontext2 = kmemdup_nul(scontext, scontext_len, gfp_flags); if (!scontext2) return -ENOMEM; if (!selinux_initialized(state)) { int i; for (i = 1; i < SECINITSID_NUM; i++) { const char *s = initial_sid_to_string[i]; if (s && !strcmp(s, scontext2)) { *sid = i; goto out; } } *sid = SECINITSID_KERNEL; goto out; } *sid = SECSID_NULL; if (force) { /* Save another copy for storing in uninterpreted form */ rc = -ENOMEM; str = kstrdup(scontext2, gfp_flags); if (!str) goto out; } retry: rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; rc = string_to_context_struct(policydb, sidtab, scontext2, &context, def_sid); if (rc == -EINVAL && force) { context.str = str; context.len = strlen(str) + 1; str = NULL; } else if (rc) goto out_unlock; rc = sidtab_context_to_sid(sidtab, &context, sid); if (rc == -ESTALE) { rcu_read_unlock(); if (context.str) { str = context.str; context.str = NULL; } context_destroy(&context); goto retry; } context_destroy(&context); out_unlock: rcu_read_unlock(); out: kfree(scontext2); kfree(str); return rc; } /** * security_context_to_sid - Obtain a SID for a given security context. 
* @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID * @gfp: context for the allocation * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ int security_context_to_sid(struct selinux_state *state, const char *scontext, u32 scontext_len, u32 *sid, gfp_t gfp) { return security_context_to_sid_core(state, scontext, scontext_len, sid, SECSID_NULL, gfp, 0); } int security_context_str_to_sid(struct selinux_state *state, const char *scontext, u32 *sid, gfp_t gfp) { return security_context_to_sid(state, scontext, strlen(scontext), sid, gfp); } /** * security_context_to_sid_default - Obtain a SID for a given security context, * falling back to specified default if needed. * * @scontext: security context * @scontext_len: length in bytes * @sid: security identifier, SID * @def_sid: default SID to assign on error * * Obtains a SID associated with the security context that * has the string representation specified by @scontext. * The default SID is passed to the MLS layer to be used to allow * kernel labeling of the MLS field if the MLS field is not present * (for upgrading to MLS without full relabel). * Implicitly forces adding of the context even if it cannot be mapped yet. * Returns -%EINVAL if the context is invalid, -%ENOMEM if insufficient * memory is available, or 0 on success. */ int security_context_to_sid_default(struct selinux_state *state, const char *scontext, u32 scontext_len, u32 *sid, u32 def_sid, gfp_t gfp_flags) { return security_context_to_sid_core(state, scontext, scontext_len, sid, def_sid, gfp_flags, 1); } int security_context_to_sid_force(struct selinux_state *state, const char *scontext, u32 scontext_len, u32 *sid) { return security_context_to_sid_core(state, scontext, scontext_len, sid, SECSID_NULL, GFP_KERNEL, 1); } static int compute_sid_handle_invalid_context( struct selinux_state *state, struct selinux_policy *policy, struct sidtab_entry *sentry, struct sidtab_entry *tentry, u16 tclass, struct context *newcontext) { struct policydb *policydb = &policy->policydb; struct sidtab *sidtab = policy->sidtab; char *s = NULL, *t = NULL, *n = NULL; u32 slen, tlen, nlen; struct audit_buffer *ab; if (sidtab_entry_to_string(policydb, sidtab, sentry, &s, &slen)) goto out; if (sidtab_entry_to_string(policydb, sidtab, tentry, &t, &tlen)) goto out; if (context_struct_to_string(policydb, newcontext, &n, &nlen)) goto out; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); audit_log_format(ab, "op=security_compute_sid invalid_context="); /* no need to record the NUL with untrusted strings */ audit_log_n_untrustedstring(ab, n, nlen - 1); audit_log_format(ab, " scontext=%s tcontext=%s tclass=%s", s, t, sym_name(policydb, SYM_CLASSES, tclass-1)); audit_log_end(ab); out: kfree(s); kfree(t); kfree(n); if (!enforcing_enabled(state)) return 0; return -EACCES; } static void filename_compute_type(struct policydb *policydb, struct context *newcontext, u32 stype, u32 ttype, u16 tclass, const char *objname) { struct filename_trans_key ft; struct filename_trans_datum *datum; /* * Most filename trans rules are going to live in specific directories * like /dev or /var/run. This bitmap will quickly skip rule searches * if the ttype does not contain any rules. 
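 *
 * Name-based transitions come from policy rules of the form (type and
 * object names below are illustrative only)
 *
 *	type_transition sshd_t tmp_t : file session_keys_t "keys";
 *
 * so a match requires the target type, target class and last path
 * component to line up, and the source type to be present in the
 * rule's stypes set.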
*/ if (!ebitmap_get_bit(&policydb->filename_trans_ttypes, ttype)) return; ft.ttype = ttype; ft.tclass = tclass; ft.name = objname; datum = policydb_filenametr_search(policydb, &ft); while (datum) { if (ebitmap_get_bit(&datum->stypes, stype - 1)) { newcontext->type = datum->otype; return; } datum = datum->next; } } static int security_compute_sid(struct selinux_state *state, u32 ssid, u32 tsid, u16 orig_tclass, u32 specified, const char *objname, u32 *out_sid, bool kern) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct class_datum *cladatum; struct context *scontext, *tcontext, newcontext; struct sidtab_entry *sentry, *tentry; struct avtab_key avkey; struct avtab_datum *avdatum; struct avtab_node *node; u16 tclass; int rc = 0; bool sock; if (!selinux_initialized(state)) { switch (orig_tclass) { case SECCLASS_PROCESS: /* kernel value */ *out_sid = ssid; break; default: *out_sid = tsid; break; } goto out; } retry: cladatum = NULL; context_init(&newcontext); rcu_read_lock(); policy = rcu_dereference(state->policy); if (kern) { tclass = unmap_class(&policy->map, orig_tclass); sock = security_is_socket_class(orig_tclass); } else { tclass = orig_tclass; sock = security_is_socket_class(map_class(&policy->map, tclass)); } policydb = &policy->policydb; sidtab = policy->sidtab; sentry = sidtab_search_entry(sidtab, ssid); if (!sentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, ssid); rc = -EINVAL; goto out_unlock; } tentry = sidtab_search_entry(sidtab, tsid); if (!tentry) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, tsid); rc = -EINVAL; goto out_unlock; } scontext = &sentry->context; tcontext = &tentry->context; if (tclass && tclass <= policydb->p_classes.nprim) cladatum = policydb->class_val_to_struct[tclass - 1]; /* Set the user identity. */ switch (specified) { case AVTAB_TRANSITION: case AVTAB_CHANGE: if (cladatum && cladatum->default_user == DEFAULT_TARGET) { newcontext.user = tcontext->user; } else { /* notice this gets both DEFAULT_SOURCE and unset */ /* Use the process user identity. */ newcontext.user = scontext->user; } break; case AVTAB_MEMBER: /* Use the related object owner. */ newcontext.user = tcontext->user; break; } /* Set the role to default values. */ if (cladatum && cladatum->default_role == DEFAULT_SOURCE) { newcontext.role = scontext->role; } else if (cladatum && cladatum->default_role == DEFAULT_TARGET) { newcontext.role = tcontext->role; } else { if ((tclass == policydb->process_class) || sock) newcontext.role = scontext->role; else newcontext.role = OBJECT_R_VAL; } /* Set the type to default values. */ if (cladatum && cladatum->default_type == DEFAULT_SOURCE) { newcontext.type = scontext->type; } else if (cladatum && cladatum->default_type == DEFAULT_TARGET) { newcontext.type = tcontext->type; } else { if ((tclass == policydb->process_class) || sock) { /* Use the type of process. */ newcontext.type = scontext->type; } else { /* Use the type of the related object. */ newcontext.type = tcontext->type; } } /* Look for a type transition/member/change rule. 
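 *
 * These are the classic, non-name-based rules, e.g. (illustrative):
 *
 *	type_transition initrc_t acct_exec_t : process acct_t;
 *	type_member sysadm_t tmp_t : dir sysadm_tmp_t;
 *	type_change sysadm_t tty_device_t : chr_file sysadm_tty_device_t;
 *
 * The unconditional te_avtab is consulted first; failing that, any
 * enabled rule in the conditional avtab is used instead.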
*/ avkey.source_type = scontext->type; avkey.target_type = tcontext->type; avkey.target_class = tclass; avkey.specified = specified; avdatum = avtab_search(&policydb->te_avtab, &avkey); /* If no permanent rule, also check for enabled conditional rules */ if (!avdatum) { node = avtab_search_node(&policydb->te_cond_avtab, &avkey); for (; node; node = avtab_search_node_next(node, specified)) { if (node->key.specified & AVTAB_ENABLED) { avdatum = &node->datum; break; } } } if (avdatum) { /* Use the type from the type transition/member/change rule. */ newcontext.type = avdatum->u.data; } /* if we have a objname this is a file trans check so check those rules */ if (objname) filename_compute_type(policydb, &newcontext, scontext->type, tcontext->type, tclass, objname); /* Check for class-specific changes. */ if (specified & AVTAB_TRANSITION) { /* Look for a role transition rule. */ struct role_trans_datum *rtd; struct role_trans_key rtk = { .role = scontext->role, .type = tcontext->type, .tclass = tclass, }; rtd = policydb_roletr_search(policydb, &rtk); if (rtd) newcontext.role = rtd->new_role; } /* Set the MLS attributes. This is done last because it may allocate memory. */ rc = mls_compute_sid(policydb, scontext, tcontext, tclass, specified, &newcontext, sock); if (rc) goto out_unlock; /* Check the validity of the context. */ if (!policydb_context_isvalid(policydb, &newcontext)) { rc = compute_sid_handle_invalid_context(state, policy, sentry, tentry, tclass, &newcontext); if (rc) goto out_unlock; } /* Obtain the sid for the context. */ rc = sidtab_context_to_sid(sidtab, &newcontext, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); context_destroy(&newcontext); goto retry; } out_unlock: rcu_read_unlock(); context_destroy(&newcontext); out: return rc; } /** * security_transition_sid - Compute the SID for a new subject/object. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for new subject/object * * Compute a SID to use for labeling a new subject or object in the * class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the new SID was * computed successfully. */ int security_transition_sid(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, const struct qstr *qstr, u32 *out_sid) { return security_compute_sid(state, ssid, tsid, tclass, AVTAB_TRANSITION, qstr ? qstr->name : NULL, out_sid, true); } int security_transition_sid_user(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, const char *objname, u32 *out_sid) { return security_compute_sid(state, ssid, tsid, tclass, AVTAB_TRANSITION, objname, out_sid, false); } /** * security_member_sid - Compute the SID for member selection. * @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use when selecting a member of a polyinstantiated * object of class @tclass based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_member_sid(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(state, ssid, tsid, tclass, AVTAB_MEMBER, NULL, out_sid, false); } /** * security_change_sid - Compute the SID for object relabeling. 
* @ssid: source security identifier * @tsid: target security identifier * @tclass: target security class * @out_sid: security identifier for selected member * * Compute a SID to use for relabeling an object of class @tclass * based on a SID pair (@ssid, @tsid). * Return -%EINVAL if any of the parameters are invalid, -%ENOMEM * if insufficient memory is available, or %0 if the SID was * computed successfully. */ int security_change_sid(struct selinux_state *state, u32 ssid, u32 tsid, u16 tclass, u32 *out_sid) { return security_compute_sid(state, ssid, tsid, tclass, AVTAB_CHANGE, NULL, out_sid, false); } static inline int convert_context_handle_invalid_context( struct selinux_state *state, struct policydb *policydb, struct context *context) { char *s; u32 len; if (enforcing_enabled(state)) return -EINVAL; if (!context_struct_to_string(policydb, context, &s, &len)) { pr_warn("SELinux: Context %s would be invalid if enforcing\n", s); kfree(s); } return 0; } /* * Convert the values in the security context * structure `oldc' from the values specified * in the policy `p->oldp' to the values specified * in the policy `p->newp', storing the new context * in `newc'. Verify that the context is valid * under the new policy. */ static int convert_context(struct context *oldc, struct context *newc, void *p) { struct convert_context_args *args; struct ocontext *oc; struct role_datum *role; struct type_datum *typdatum; struct user_datum *usrdatum; char *s; u32 len; int rc; args = p; if (oldc->str) { s = kstrdup(oldc->str, GFP_KERNEL); if (!s) return -ENOMEM; rc = string_to_context_struct(args->newp, NULL, s, newc, SECSID_NULL); if (rc == -EINVAL) { /* * Retain string representation for later mapping. * * IMPORTANT: We need to copy the contents of oldc->str * back into s again because string_to_context_struct() * may have garbled it. */ memcpy(s, oldc->str, oldc->len); context_init(newc); newc->str = s; newc->len = oldc->len; return 0; } kfree(s); if (rc) { /* Other error condition, e.g. ENOMEM. */ pr_err("SELinux: Unable to map context %s, rc = %d.\n", oldc->str, -rc); return rc; } pr_info("SELinux: Context %s became valid (mapped).\n", oldc->str); return 0; } context_init(newc); /* Convert the user. */ rc = -EINVAL; usrdatum = symtab_search(&args->newp->p_users, sym_name(args->oldp, SYM_USERS, oldc->user - 1)); if (!usrdatum) goto bad; newc->user = usrdatum->value; /* Convert the role. */ rc = -EINVAL; role = symtab_search(&args->newp->p_roles, sym_name(args->oldp, SYM_ROLES, oldc->role - 1)); if (!role) goto bad; newc->role = role->value; /* Convert the type. */ rc = -EINVAL; typdatum = symtab_search(&args->newp->p_types, sym_name(args->oldp, SYM_TYPES, oldc->type - 1)); if (!typdatum) goto bad; newc->type = typdatum->value; /* Convert the MLS fields if dealing with MLS policies */ if (args->oldp->mls_enabled && args->newp->mls_enabled) { rc = mls_convert_context(args->oldp, args->newp, oldc, newc); if (rc) goto bad; } else if (!args->oldp->mls_enabled && args->newp->mls_enabled) { /* * Switching between non-MLS and MLS policy: * ensure that the MLS fields of the context for all * existing entries in the sidtab are filled in with a * suitable default value, likely taken from one of the * initial SIDs. 
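 *
 * Concretely, the loop below locates the SECINITSID_UNLABELED entry in
 * the new policy's initial-SID list and copies its MLS range into the
 * converted context via mls_range_set().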
*/ oc = args->newp->ocontexts[OCON_ISID]; while (oc && oc->sid[0] != SECINITSID_UNLABELED) oc = oc->next; rc = -EINVAL; if (!oc) { pr_err("SELinux: unable to look up" " the initial SIDs list\n"); goto bad; } rc = mls_range_set(newc, &oc->context[0].range); if (rc) goto bad; } /* Check the validity of the new context. */ if (!policydb_context_isvalid(args->newp, newc)) { rc = convert_context_handle_invalid_context(args->state, args->oldp, oldc); if (rc) goto bad; } return 0; bad: /* Map old representation to string and save it. */ rc = context_struct_to_string(args->oldp, oldc, &s, &len); if (rc) return rc; context_destroy(newc); newc->str = s; newc->len = len; pr_info("SELinux: Context %s became invalid (unmapped).\n", newc->str); return 0; } static void security_load_policycaps(struct selinux_state *state, struct selinux_policy *policy) { struct policydb *p; unsigned int i; struct ebitmap_node *node; p = &policy->policydb; for (i = 0; i < ARRAY_SIZE(state->policycap); i++) WRITE_ONCE(state->policycap[i], ebitmap_get_bit(&p->policycaps, i)); for (i = 0; i < ARRAY_SIZE(selinux_policycap_names); i++) pr_info("SELinux: policy capability %s=%d\n", selinux_policycap_names[i], ebitmap_get_bit(&p->policycaps, i)); ebitmap_for_each_positive_bit(&p->policycaps, node, i) { if (i >= ARRAY_SIZE(selinux_policycap_names)) pr_info("SELinux: unknown policy capability %u\n", i); } } static int security_preserve_bools(struct selinux_policy *oldpolicy, struct selinux_policy *newpolicy); static void selinux_policy_free(struct selinux_policy *policy) { if (!policy) return; sidtab_destroy(policy->sidtab); kfree(policy->map.mapping); policydb_destroy(&policy->policydb); kfree(policy->sidtab); kfree(policy); } static void selinux_policy_cond_free(struct selinux_policy *policy) { cond_policydb_destroy_dup(&policy->policydb); kfree(policy); } void selinux_policy_cancel(struct selinux_state *state, struct selinux_load_state *load_state) { struct selinux_policy *oldpolicy; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); sidtab_cancel_convert(oldpolicy->sidtab); selinux_policy_free(load_state->policy); kfree(load_state->convert_data); } static void selinux_notify_policy_change(struct selinux_state *state, u32 seqno) { /* Flush external caches and notify userspace of policy load */ avc_ss_reset(state->avc, seqno); selnl_notify_policyload(seqno); selinux_status_update_policyload(state, seqno); selinux_netlbl_cache_invalidate(); selinux_xfrm_notify_policyload(); } void selinux_policy_commit(struct selinux_state *state, struct selinux_load_state *load_state) { struct selinux_policy *oldpolicy, *newpolicy = load_state->policy; unsigned long flags; u32 seqno; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* If switching between different policy types, log MLS status */ if (oldpolicy) { if (oldpolicy->policydb.mls_enabled && !newpolicy->policydb.mls_enabled) pr_info("SELinux: Disabling MLS support...\n"); else if (!oldpolicy->policydb.mls_enabled && newpolicy->policydb.mls_enabled) pr_info("SELinux: Enabling MLS support...\n"); } /* Set latest granting seqno for new policy. */ if (oldpolicy) newpolicy->latest_granting = oldpolicy->latest_granting + 1; else newpolicy->latest_granting = 1; seqno = newpolicy->latest_granting; /* Install the new policy. 
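 * rcu_assign_pointer() publishes the new policy, so readers entering
 * rcu_read_lock() from now on see it, while readers still inside an
 * older read-side section keep using the old policy until the
 * synchronize_rcu() below.  Freezing the old sidtab makes concurrent
 * sidtab_context_to_sid() callers back off with -ESTALE and retry
 * against the newly published policy rather than insert into a table
 * that is about to be freed.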
*/ if (oldpolicy) { sidtab_freeze_begin(oldpolicy->sidtab, &flags); rcu_assign_pointer(state->policy, newpolicy); sidtab_freeze_end(oldpolicy->sidtab, &flags); } else { rcu_assign_pointer(state->policy, newpolicy); } /* Load the policycaps from the new policy */ security_load_policycaps(state, newpolicy); if (!selinux_initialized(state)) { /* * After first policy load, the security server is * marked as initialized and ready to handle requests and * any objects created prior to policy load are then labeled. */ selinux_mark_initialized(state); selinux_complete_init(); } /* Free the old policy */ synchronize_rcu(); selinux_policy_free(oldpolicy); kfree(load_state->convert_data); /* Notify others of the policy change */ selinux_notify_policy_change(state, seqno); } /** * security_load_policy - Load a security policy configuration. * @data: binary policy data * @len: length of data in bytes * * Load a new set of security policy configuration data, * validate it and convert the SID table as necessary. * This function will flush the access vector cache after * loading the new policy. */ int security_load_policy(struct selinux_state *state, void *data, size_t len, struct selinux_load_state *load_state) { struct selinux_policy *newpolicy, *oldpolicy; struct selinux_policy_convert_data *convert_data; int rc = 0; struct policy_file file = { data, len }, *fp = &file; newpolicy = kzalloc(sizeof(*newpolicy), GFP_KERNEL); if (!newpolicy) return -ENOMEM; newpolicy->sidtab = kzalloc(sizeof(*newpolicy->sidtab), GFP_KERNEL); if (!newpolicy->sidtab) { rc = -ENOMEM; goto err_policy; } rc = policydb_read(&newpolicy->policydb, fp); if (rc) goto err_sidtab; newpolicy->policydb.len = len; rc = selinux_set_mapping(&newpolicy->policydb, secclass_map, &newpolicy->map); if (rc) goto err_policydb; rc = policydb_load_isids(&newpolicy->policydb, newpolicy->sidtab); if (rc) { pr_err("SELinux: unable to load the initial SIDs\n"); goto err_mapping; } if (!selinux_initialized(state)) { /* First policy load, so no need to preserve state from old policy */ load_state->policy = newpolicy; load_state->convert_data = NULL; return 0; } oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* Preserve active boolean values from the old policy */ rc = security_preserve_bools(oldpolicy, newpolicy); if (rc) { pr_err("SELinux: unable to preserve booleans\n"); goto err_free_isids; } convert_data = kmalloc(sizeof(*convert_data), GFP_KERNEL); if (!convert_data) { rc = -ENOMEM; goto err_free_isids; } /* * Convert the internal representations of contexts * in the new SID table. 
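 *
 * sidtab_convert() rewrites every context already present in the old
 * sidtab into the new policy's value space using convert_context(),
 * and keeps the parameters around so that entries added while the
 * conversion is in flight are converted as well; convert_data itself
 * is freed later by selinux_policy_commit() or selinux_policy_cancel().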
*/ convert_data->args.state = state; convert_data->args.oldp = &oldpolicy->policydb; convert_data->args.newp = &newpolicy->policydb; convert_data->sidtab_params.func = convert_context; convert_data->sidtab_params.args = &convert_data->args; convert_data->sidtab_params.target = newpolicy->sidtab; rc = sidtab_convert(oldpolicy->sidtab, &convert_data->sidtab_params); if (rc) { pr_err("SELinux: unable to convert the internal" " representation of contexts in the new SID" " table\n"); goto err_free_convert_data; } load_state->policy = newpolicy; load_state->convert_data = convert_data; return 0; err_free_convert_data: kfree(convert_data); err_free_isids: sidtab_destroy(newpolicy->sidtab); err_mapping: kfree(newpolicy->map.mapping); err_policydb: policydb_destroy(&newpolicy->policydb); err_sidtab: kfree(newpolicy->sidtab); err_policy: kfree(newpolicy); return rc; } /** * ocontext_to_sid - Helper to safely get sid for an ocontext * @sidtab: SID table * @c: ocontext structure * @index: index of the context entry (0 or 1) * @out_sid: pointer to the resulting SID value * * For all ocontexts except OCON_ISID the SID fields are populated * on-demand when needed. Since updating the SID value is an SMP-sensitive * operation, this helper must be used to do that safely. * * WARNING: This function may return -ESTALE, indicating that the caller * must retry the operation after re-acquiring the policy pointer! */ static int ocontext_to_sid(struct sidtab *sidtab, struct ocontext *c, size_t index, u32 *out_sid) { int rc; u32 sid; /* Ensure the associated sidtab entry is visible to this thread. */ sid = smp_load_acquire(&c->sid[index]); if (!sid) { rc = sidtab_context_to_sid(sidtab, &c->context[index], &sid); if (rc) return rc; /* * Ensure the new sidtab entry is visible to other threads * when they see the SID. */ smp_store_release(&c->sid[index], sid); } *out_sid = sid; return 0; } /** * security_port_sid - Obtain the SID for a port. * @protocol: protocol number * @port: port number * @out_sid: security identifier */ int security_port_sid(struct selinux_state *state, u8 protocol, u16 port, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized(state)) { *out_sid = SECINITSID_PORT; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_PORT]; while (c) { if (c->u.port.protocol == protocol && c->u.port.low_port <= port && c->u.port.high_port >= port) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { *out_sid = SECINITSID_PORT; } out: rcu_read_unlock(); return rc; } /** * security_pkey_sid - Obtain the SID for a pkey. 
* @subnet_prefix: Subnet Prefix * @pkey_num: pkey number * @out_sid: security identifier */ int security_ib_pkey_sid(struct selinux_state *state, u64 subnet_prefix, u16 pkey_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized(state)) { *out_sid = SECINITSID_UNLABELED; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBPKEY]; while (c) { if (c->u.ibpkey.low_pkey <= pkey_num && c->u.ibpkey.high_pkey >= pkey_num && c->u.ibpkey.subnet_prefix == subnet_prefix) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *out_sid = SECINITSID_UNLABELED; out: rcu_read_unlock(); return rc; } /** * security_ib_endport_sid - Obtain the SID for a subnet management interface. * @dev_name: device name * @port: port number * @out_sid: security identifier */ int security_ib_endport_sid(struct selinux_state *state, const char *dev_name, u8 port_num, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct ocontext *c; int rc; if (!selinux_initialized(state)) { *out_sid = SECINITSID_UNLABELED; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_IBENDPORT]; while (c) { if (c->u.ibendport.port == port_num && !strncmp(c->u.ibendport.dev_name, dev_name, IB_DEVICE_NAME_MAX)) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *out_sid = SECINITSID_UNLABELED; out: rcu_read_unlock(); return rc; } /** * security_netif_sid - Obtain the SID for a network interface. * @name: interface name * @if_sid: interface SID */ int security_netif_sid(struct selinux_state *state, char *name, u32 *if_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; if (!selinux_initialized(state)) { *if_sid = SECINITSID_NETIF; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_NETIF]; while (c) { if (strcmp(name, c->u.name) == 0) break; c = c->next; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, if_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else *if_sid = SECINITSID_NETIF; out: rcu_read_unlock(); return rc; } static int match_ipv6_addrmask(u32 *input, u32 *addr, u32 *mask) { int i, fail = 0; for (i = 0; i < 4; i++) if (addr[i] != (input[i] & mask[i])) { fail = 1; break; } return !fail; } /** * security_node_sid - Obtain the SID for a node (host). 
* @domain: communication domain aka address family * @addrp: address * @addrlen: address length in bytes * @out_sid: security identifier */ int security_node_sid(struct selinux_state *state, u16 domain, void *addrp, u32 addrlen, u32 *out_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; if (!selinux_initialized(state)) { *out_sid = SECINITSID_NODE; return 0; } retry: rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; switch (domain) { case AF_INET: { u32 addr; rc = -EINVAL; if (addrlen != sizeof(u32)) goto out; addr = *((u32 *)addrp); c = policydb->ocontexts[OCON_NODE]; while (c) { if (c->u.node.addr == (addr & c->u.node.mask)) break; c = c->next; } break; } case AF_INET6: rc = -EINVAL; if (addrlen != sizeof(u64) * 2) goto out; c = policydb->ocontexts[OCON_NODE6]; while (c) { if (match_ipv6_addrmask(addrp, c->u.node6.addr, c->u.node6.mask)) break; c = c->next; } break; default: rc = 0; *out_sid = SECINITSID_NODE; goto out; } if (c) { rc = ocontext_to_sid(sidtab, c, 0, out_sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { *out_sid = SECINITSID_NODE; } rc = 0; out: rcu_read_unlock(); return rc; } #define SIDS_NEL 25 /** * security_get_user_sids - Obtain reachable SIDs for a user. * @fromsid: starting SID * @username: username * @sids: array of reachable SIDs for user * @nel: number of elements in @sids * * Generate the set of SIDs for legal security contexts * for a given user that can be reached by @fromsid. * Set *@sids to point to a dynamically allocated * array containing the set of SIDs. Set *@nel to the * number of elements in the array. */ int security_get_user_sids(struct selinux_state *state, u32 fromsid, char *username, u32 **sids, u32 *nel) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *fromcon, usercon; u32 *mysids = NULL, *mysids2, sid; u32 i, j, mynel, maxnel = SIDS_NEL; struct user_datum *user; struct role_datum *role; struct ebitmap_node *rnode, *tnode; int rc; *sids = NULL; *nel = 0; if (!selinux_initialized(state)) return 0; mysids = kcalloc(maxnel, sizeof(*mysids), GFP_KERNEL); if (!mysids) return -ENOMEM; retry: mynel = 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; context_init(&usercon); rc = -EINVAL; fromcon = sidtab_search(sidtab, fromsid); if (!fromcon) goto out_unlock; rc = -EINVAL; user = symtab_search(&policydb->p_users, username); if (!user) goto out_unlock; usercon.user = user->value; ebitmap_for_each_positive_bit(&user->roles, rnode, i) { role = policydb->role_val_to_struct[i]; usercon.role = i + 1; ebitmap_for_each_positive_bit(&role->types, tnode, j) { usercon.type = j + 1; if (mls_setup_user_range(policydb, fromcon, user, &usercon)) continue; rc = sidtab_context_to_sid(sidtab, &usercon, &sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out_unlock; if (mynel < maxnel) { mysids[mynel++] = sid; } else { rc = -ENOMEM; maxnel += SIDS_NEL; mysids2 = kcalloc(maxnel, sizeof(*mysids2), GFP_ATOMIC); if (!mysids2) goto out_unlock; memcpy(mysids2, mysids, mynel * sizeof(*mysids2)); kfree(mysids); mysids = mysids2; mysids[mynel++] = sid; } } } rc = 0; out_unlock: rcu_read_unlock(); if (rc || !mynel) { kfree(mysids); return rc; } rc = -ENOMEM; mysids2 = kcalloc(mynel, sizeof(*mysids2), GFP_KERNEL); if (!mysids2) { kfree(mysids); return rc; } for (i = 0, j = 0; i < 
mynel; i++) { struct av_decision dummy_avd; rc = avc_has_perm_noaudit(state, fromsid, mysids[i], SECCLASS_PROCESS, /* kernel value */ PROCESS__TRANSITION, AVC_STRICT, &dummy_avd); if (!rc) mysids2[j++] = mysids[i]; cond_resched(); } kfree(mysids); *sids = mysids2; *nel = j; return 0; } /** * __security_genfs_sid - Helper to obtain a SID for a file in a filesystem * @fstype: filesystem type * @path: path from root of mount * @sclass: file security class * @sid: SID for path * * Obtain a SID to use for a file in a filesystem that * cannot support xattr or use a fixed labeling behavior like * transition SIDs or task SIDs. * * WARNING: This function may return -ESTALE, indicating that the caller * must retry the operation after re-acquiring the policy pointer! */ static inline int __security_genfs_sid(struct selinux_policy *policy, const char *fstype, char *path, u16 orig_sclass, u32 *sid) { struct policydb *policydb = &policy->policydb; struct sidtab *sidtab = policy->sidtab; int len; u16 sclass; struct genfs *genfs; struct ocontext *c; int cmp = 0; while (path[0] == '/' && path[1] == '/') path++; sclass = unmap_class(&policy->map, orig_sclass); *sid = SECINITSID_UNLABELED; for (genfs = policydb->genfs; genfs; genfs = genfs->next) { cmp = strcmp(fstype, genfs->fstype); if (cmp <= 0) break; } if (!genfs || cmp) return -ENOENT; for (c = genfs->head; c; c = c->next) { len = strlen(c->u.name); if ((!c->v.sclass || sclass == c->v.sclass) && (strncmp(c->u.name, path, len) == 0)) break; } if (!c) return -ENOENT; return ocontext_to_sid(sidtab, c, 0, sid); } /** * security_genfs_sid - Obtain a SID for a file in a filesystem * @fstype: filesystem type * @path: path from root of mount * @sclass: file security class * @sid: SID for path * * Acquire policy_rwlock before calling __security_genfs_sid() and release * it afterward. */ int security_genfs_sid(struct selinux_state *state, const char *fstype, char *path, u16 orig_sclass, u32 *sid) { struct selinux_policy *policy; int retval; if (!selinux_initialized(state)) { *sid = SECINITSID_UNLABELED; return 0; } do { rcu_read_lock(); policy = rcu_dereference(state->policy); retval = __security_genfs_sid(policy, fstype, path, orig_sclass, sid); rcu_read_unlock(); } while (retval == -ESTALE); return retval; } int selinux_policy_genfs_sid(struct selinux_policy *policy, const char *fstype, char *path, u16 orig_sclass, u32 *sid) { /* no lock required, policy is not yet accessible by other threads */ return __security_genfs_sid(policy, fstype, path, orig_sclass, sid); } /** * security_fs_use - Determine how to handle labeling for a filesystem. 
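 * @state: SELinux state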
* @sb: superblock in question */ int security_fs_use(struct selinux_state *state, struct super_block *sb) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct ocontext *c; struct superblock_security_struct *sbsec = sb->s_security; const char *fstype = sb->s_type->name; if (!selinux_initialized(state)) { sbsec->behavior = SECURITY_FS_USE_NONE; sbsec->sid = SECINITSID_UNLABELED; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; c = policydb->ocontexts[OCON_FSUSE]; while (c) { if (strcmp(fstype, c->u.name) == 0) break; c = c->next; } if (c) { sbsec->behavior = c->v.behavior; rc = ocontext_to_sid(sidtab, c, 0, &sbsec->sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; } else { rc = __security_genfs_sid(policy, fstype, "/", SECCLASS_DIR, &sbsec->sid); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) { sbsec->behavior = SECURITY_FS_USE_NONE; rc = 0; } else { sbsec->behavior = SECURITY_FS_USE_GENFS; } } out: rcu_read_unlock(); return rc; } int security_get_bools(struct selinux_policy *policy, u32 *len, char ***names, int **values) { struct policydb *policydb; u32 i; int rc; policydb = &policy->policydb; *names = NULL; *values = NULL; rc = 0; *len = policydb->p_bools.nprim; if (!*len) goto out; rc = -ENOMEM; *names = kcalloc(*len, sizeof(char *), GFP_ATOMIC); if (!*names) goto err; rc = -ENOMEM; *values = kcalloc(*len, sizeof(int), GFP_ATOMIC); if (!*values) goto err; for (i = 0; i < *len; i++) { (*values)[i] = policydb->bool_val_to_struct[i]->state; rc = -ENOMEM; (*names)[i] = kstrdup(sym_name(policydb, SYM_BOOLS, i), GFP_ATOMIC); if (!(*names)[i]) goto err; } rc = 0; out: return rc; err: if (*names) { for (i = 0; i < *len; i++) kfree((*names)[i]); kfree(*names); } kfree(*values); *len = 0; *names = NULL; *values = NULL; goto out; } int security_set_bools(struct selinux_state *state, u32 len, int *values) { struct selinux_policy *newpolicy, *oldpolicy; int rc; u32 i, seqno = 0; if (!selinux_initialized(state)) return -EINVAL; oldpolicy = rcu_dereference_protected(state->policy, lockdep_is_held(&state->policy_mutex)); /* Consistency check on number of booleans, should never fail */ if (WARN_ON(len != oldpolicy->policydb.p_bools.nprim)) return -EINVAL; newpolicy = kmemdup(oldpolicy, sizeof(*newpolicy), GFP_KERNEL); if (!newpolicy) return -ENOMEM; /* * Deep copy only the parts of the policydb that might be * modified as a result of changing booleans. 
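 * (i.e. the conditional portion: the boolean symbol table and the
 * conditional rule lists duplicated by cond_policydb_dup() below).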
*/ rc = cond_policydb_dup(&newpolicy->policydb, &oldpolicy->policydb); if (rc) { kfree(newpolicy); return -ENOMEM; } /* Update the boolean states in the copy */ for (i = 0; i < len; i++) { int new_state = !!values[i]; int old_state = newpolicy->policydb.bool_val_to_struct[i]->state; if (new_state != old_state) { audit_log(audit_context(), GFP_ATOMIC, AUDIT_MAC_CONFIG_CHANGE, "bool=%s val=%d old_val=%d auid=%u ses=%u", sym_name(&newpolicy->policydb, SYM_BOOLS, i), new_state, old_state, from_kuid(&init_user_ns, audit_get_loginuid(current)), audit_get_sessionid(current)); newpolicy->policydb.bool_val_to_struct[i]->state = new_state; } } /* Re-evaluate the conditional rules in the copy */ evaluate_cond_nodes(&newpolicy->policydb); /* Set latest granting seqno for new policy */ newpolicy->latest_granting = oldpolicy->latest_granting + 1; seqno = newpolicy->latest_granting; /* Install the new policy */ rcu_assign_pointer(state->policy, newpolicy); /* * Free the conditional portions of the old policydb * that were copied for the new policy, and the oldpolicy * structure itself but not what it references. */ synchronize_rcu(); selinux_policy_cond_free(oldpolicy); /* Notify others of the policy change */ selinux_notify_policy_change(state, seqno); return 0; } int security_get_bool_value(struct selinux_state *state, u32 index) { struct selinux_policy *policy; struct policydb *policydb; int rc; u32 len; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; rc = -EFAULT; len = policydb->p_bools.nprim; if (index >= len) goto out; rc = policydb->bool_val_to_struct[index]->state; out: rcu_read_unlock(); return rc; } static int security_preserve_bools(struct selinux_policy *oldpolicy, struct selinux_policy *newpolicy) { int rc, *bvalues = NULL; char **bnames = NULL; struct cond_bool_datum *booldatum; u32 i, nbools = 0; rc = security_get_bools(oldpolicy, &nbools, &bnames, &bvalues); if (rc) goto out; for (i = 0; i < nbools; i++) { booldatum = symtab_search(&newpolicy->policydb.p_bools, bnames[i]); if (booldatum) booldatum->state = bvalues[i]; } evaluate_cond_nodes(&newpolicy->policydb); out: if (bnames) { for (i = 0; i < nbools; i++) kfree(bnames[i]); } kfree(bnames); kfree(bvalues); return rc; } /* * security_sid_mls_copy() - computes a new sid based on the given * sid and the mls portion of mls_sid. */ int security_sid_mls_copy(struct selinux_state *state, u32 sid, u32 mls_sid, u32 *new_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; struct context *context1; struct context *context2; struct context newcon; char *s; u32 len; int rc; if (!selinux_initialized(state)) { *new_sid = sid; return 0; } retry: rc = 0; context_init(&newcon); rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (!policydb->mls_enabled) { *new_sid = sid; goto out_unlock; } rc = -EINVAL; context1 = sidtab_search(sidtab, sid); if (!context1) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, sid); goto out_unlock; } rc = -EINVAL; context2 = sidtab_search(sidtab, mls_sid); if (!context2) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, mls_sid); goto out_unlock; } newcon.user = context1->user; newcon.role = context1->role; newcon.type = context1->type; rc = mls_context_cpy(&newcon, context2); if (rc) goto out_unlock; /* Check the validity of the new context. 
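 * An invalid user/role/type/MLS combination is only tolerated here when
 * SELinux is permissive; convert_context_handle_invalid_context() rejects
 * it (and the copy fails) in enforcing mode.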
*/ if (!policydb_context_isvalid(policydb, &newcon)) { rc = convert_context_handle_invalid_context(state, policydb, &newcon); if (rc) { if (!context_struct_to_string(policydb, &newcon, &s, &len)) { struct audit_buffer *ab; ab = audit_log_start(audit_context(), GFP_ATOMIC, AUDIT_SELINUX_ERR); audit_log_format(ab, "op=security_sid_mls_copy invalid_context="); /* don't record NUL with untrusted strings */ audit_log_n_untrustedstring(ab, s, len - 1); audit_log_end(ab); kfree(s); } goto out_unlock; } } rc = sidtab_context_to_sid(sidtab, &newcon, new_sid); if (rc == -ESTALE) { rcu_read_unlock(); context_destroy(&newcon); goto retry; } out_unlock: rcu_read_unlock(); context_destroy(&newcon); return rc; } /** * security_net_peersid_resolve - Compare and resolve two network peer SIDs * @nlbl_sid: NetLabel SID * @nlbl_type: NetLabel labeling protocol type * @xfrm_sid: XFRM SID * * Description: * Compare the @nlbl_sid and @xfrm_sid values and if the two SIDs can be * resolved into a single SID it is returned via @peer_sid and the function * returns zero. Otherwise @peer_sid is set to SECSID_NULL and the function * returns a negative value. A table summarizing the behavior is below: * * | function return | @sid * ------------------------------+-----------------+----------------- * no peer labels | 0 | SECSID_NULL * single peer label | 0 | <peer_label> * multiple, consistent labels | 0 | <peer_label> * multiple, inconsistent labels | -<errno> | SECSID_NULL * */ int security_net_peersid_resolve(struct selinux_state *state, u32 nlbl_sid, u32 nlbl_type, u32 xfrm_sid, u32 *peer_sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct context *nlbl_ctx; struct context *xfrm_ctx; *peer_sid = SECSID_NULL; /* handle the common (which also happens to be the set of easy) cases * right away, these two if statements catch everything involving a * single or absent peer SID/label */ if (xfrm_sid == SECSID_NULL) { *peer_sid = nlbl_sid; return 0; } /* NOTE: an nlbl_type == NETLBL_NLTYPE_UNLABELED is a "fallback" label * and is treated as if nlbl_sid == SECSID_NULL when a XFRM SID/label * is present */ if (nlbl_sid == SECSID_NULL || nlbl_type == NETLBL_NLTYPE_UNLABELED) { *peer_sid = xfrm_sid; return 0; } if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; /* * We don't need to check initialized here since the only way both * nlbl_sid and xfrm_sid are not equal to SECSID_NULL would be if the * security server was initialized and state->initialized was true. */ if (!policydb->mls_enabled) { rc = 0; goto out; } rc = -EINVAL; nlbl_ctx = sidtab_search(sidtab, nlbl_sid); if (!nlbl_ctx) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, nlbl_sid); goto out; } rc = -EINVAL; xfrm_ctx = sidtab_search(sidtab, xfrm_sid); if (!xfrm_ctx) { pr_err("SELinux: %s: unrecognized SID %d\n", __func__, xfrm_sid); goto out; } rc = (mls_context_cmp(nlbl_ctx, xfrm_ctx) ? 
0 : -EACCES); if (rc) goto out; /* at present NetLabel SIDs/labels really only carry MLS * information so if the MLS portion of the NetLabel SID * matches the MLS portion of the labeled XFRM SID/label * then pass along the XFRM SID as it is the most * expressive */ *peer_sid = xfrm_sid; out: rcu_read_unlock(); return rc; } static int get_classes_callback(void *k, void *d, void *args) { struct class_datum *datum = d; char *name = k, **classes = args; int value = datum->value - 1; classes[value] = kstrdup(name, GFP_ATOMIC); if (!classes[value]) return -ENOMEM; return 0; } int security_get_classes(struct selinux_policy *policy, char ***classes, int *nclasses) { struct policydb *policydb; int rc; policydb = &policy->policydb; rc = -ENOMEM; *nclasses = policydb->p_classes.nprim; *classes = kcalloc(*nclasses, sizeof(**classes), GFP_ATOMIC); if (!*classes) goto out; rc = hashtab_map(&policydb->p_classes.table, get_classes_callback, *classes); if (rc) { int i; for (i = 0; i < *nclasses; i++) kfree((*classes)[i]); kfree(*classes); } out: return rc; } static int get_permissions_callback(void *k, void *d, void *args) { struct perm_datum *datum = d; char *name = k, **perms = args; int value = datum->value - 1; perms[value] = kstrdup(name, GFP_ATOMIC); if (!perms[value]) return -ENOMEM; return 0; } int security_get_permissions(struct selinux_policy *policy, char *class, char ***perms, int *nperms) { struct policydb *policydb; int rc, i; struct class_datum *match; policydb = &policy->policydb; rc = -EINVAL; match = symtab_search(&policydb->p_classes, class); if (!match) { pr_err("SELinux: %s: unrecognized class %s\n", __func__, class); goto out; } rc = -ENOMEM; *nperms = match->permissions.nprim; *perms = kcalloc(*nperms, sizeof(**perms), GFP_ATOMIC); if (!*perms) goto out; if (match->comdatum) { rc = hashtab_map(&match->comdatum->permissions.table, get_permissions_callback, *perms); if (rc) goto err; } rc = hashtab_map(&match->permissions.table, get_permissions_callback, *perms); if (rc) goto err; out: return rc; err: for (i = 0; i < *nperms; i++) kfree((*perms)[i]); kfree(*perms); return rc; } int security_get_reject_unknown(struct selinux_state *state) { struct selinux_policy *policy; int value; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); value = policy->policydb.reject_unknown; rcu_read_unlock(); return value; } int security_get_allow_unknown(struct selinux_state *state) { struct selinux_policy *policy; int value; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); value = policy->policydb.allow_unknown; rcu_read_unlock(); return value; } /** * security_policycap_supported - Check for a specific policy capability * @req_cap: capability * * Description: * This function queries the currently loaded policy to see if it supports the * capability specified by @req_cap. Returns true (1) if the capability is * supported, false (0) if it isn't supported. 
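 * Before the first policy load, all capabilities are reported as unsupported.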
* */ int security_policycap_supported(struct selinux_state *state, unsigned int req_cap) { struct selinux_policy *policy; int rc; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); rc = ebitmap_get_bit(&policy->policydb.policycaps, req_cap); rcu_read_unlock(); return rc; } struct selinux_audit_rule { u32 au_seqno; struct context au_ctxt; }; void selinux_audit_rule_free(void *vrule) { struct selinux_audit_rule *rule = vrule; if (rule) { context_destroy(&rule->au_ctxt); kfree(rule); } } int selinux_audit_rule_init(u32 field, u32 op, char *rulestr, void **vrule) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; struct policydb *policydb; struct selinux_audit_rule *tmprule; struct role_datum *roledatum; struct type_datum *typedatum; struct user_datum *userdatum; struct selinux_audit_rule **rule = (struct selinux_audit_rule **)vrule; int rc = 0; *rule = NULL; if (!selinux_initialized(state)) return -EOPNOTSUPP; switch (field) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: /* only 'equals' and 'not equals' fit user, role, and type */ if (op != Audit_equal && op != Audit_not_equal) return -EINVAL; break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: /* we do not allow a range, indicated by the presence of '-' */ if (strchr(rulestr, '-')) return -EINVAL; break; default: /* only the above fields are valid */ return -EINVAL; } tmprule = kzalloc(sizeof(struct selinux_audit_rule), GFP_KERNEL); if (!tmprule) return -ENOMEM; context_init(&tmprule->au_ctxt); rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; tmprule->au_seqno = policy->latest_granting; switch (field) { case AUDIT_SUBJ_USER: case AUDIT_OBJ_USER: rc = -EINVAL; userdatum = symtab_search(&policydb->p_users, rulestr); if (!userdatum) goto out; tmprule->au_ctxt.user = userdatum->value; break; case AUDIT_SUBJ_ROLE: case AUDIT_OBJ_ROLE: rc = -EINVAL; roledatum = symtab_search(&policydb->p_roles, rulestr); if (!roledatum) goto out; tmprule->au_ctxt.role = roledatum->value; break; case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_TYPE: rc = -EINVAL; typedatum = symtab_search(&policydb->p_types, rulestr); if (!typedatum) goto out; tmprule->au_ctxt.type = typedatum->value; break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: rc = mls_from_string(policydb, rulestr, &tmprule->au_ctxt, GFP_ATOMIC); if (rc) goto out; break; } rc = 0; out: rcu_read_unlock(); if (rc) { selinux_audit_rule_free(tmprule); tmprule = NULL; } *rule = tmprule; return rc; } /* Check to see if the rule contains any selinux fields */ int selinux_audit_rule_known(struct audit_krule *rule) { int i; for (i = 0; i < rule->field_count; i++) { struct audit_field *f = &rule->fields[i]; switch (f->type) { case AUDIT_SUBJ_USER: case AUDIT_SUBJ_ROLE: case AUDIT_SUBJ_TYPE: case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_USER: case AUDIT_OBJ_ROLE: case AUDIT_OBJ_TYPE: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: return 1; } } return 0; } int selinux_audit_rule_match(u32 sid, u32 field, u32 op, void *vrule) { struct selinux_state *state = &selinux_state; struct selinux_policy *policy; struct context *ctxt; struct mls_level *level; struct selinux_audit_rule *rule = vrule; int match = 0; if (unlikely(!rule)) { WARN_ONCE(1, "selinux_audit_rule_match: missing rule\n"); return -ENOENT; } if 
(!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); if (rule->au_seqno < policy->latest_granting) { match = -ESTALE; goto out; } ctxt = sidtab_search(policy->sidtab, sid); if (unlikely(!ctxt)) { WARN_ONCE(1, "selinux_audit_rule_match: unrecognized SID %d\n", sid); match = -ENOENT; goto out; } /* a field/op pair that is not caught here will simply fall through without a match */ switch (field) { case AUDIT_SUBJ_USER: case AUDIT_OBJ_USER: switch (op) { case Audit_equal: match = (ctxt->user == rule->au_ctxt.user); break; case Audit_not_equal: match = (ctxt->user != rule->au_ctxt.user); break; } break; case AUDIT_SUBJ_ROLE: case AUDIT_OBJ_ROLE: switch (op) { case Audit_equal: match = (ctxt->role == rule->au_ctxt.role); break; case Audit_not_equal: match = (ctxt->role != rule->au_ctxt.role); break; } break; case AUDIT_SUBJ_TYPE: case AUDIT_OBJ_TYPE: switch (op) { case Audit_equal: match = (ctxt->type == rule->au_ctxt.type); break; case Audit_not_equal: match = (ctxt->type != rule->au_ctxt.type); break; } break; case AUDIT_SUBJ_SEN: case AUDIT_SUBJ_CLR: case AUDIT_OBJ_LEV_LOW: case AUDIT_OBJ_LEV_HIGH: level = ((field == AUDIT_SUBJ_SEN || field == AUDIT_OBJ_LEV_LOW) ? &ctxt->range.level[0] : &ctxt->range.level[1]); switch (op) { case Audit_equal: match = mls_level_eq(&rule->au_ctxt.range.level[0], level); break; case Audit_not_equal: match = !mls_level_eq(&rule->au_ctxt.range.level[0], level); break; case Audit_lt: match = (mls_level_dom(&rule->au_ctxt.range.level[0], level) && !mls_level_eq(&rule->au_ctxt.range.level[0], level)); break; case Audit_le: match = mls_level_dom(&rule->au_ctxt.range.level[0], level); break; case Audit_gt: match = (mls_level_dom(level, &rule->au_ctxt.range.level[0]) && !mls_level_eq(level, &rule->au_ctxt.range.level[0])); break; case Audit_ge: match = mls_level_dom(level, &rule->au_ctxt.range.level[0]); break; } } out: rcu_read_unlock(); return match; } static int (*aurule_callback)(void) = audit_update_lsm_rules; static int aurule_avc_callback(u32 event) { int err = 0; if (event == AVC_CALLBACK_RESET && aurule_callback) err = aurule_callback(); return err; } static int __init aurule_init(void) { int err; err = avc_add_callback(aurule_avc_callback, AVC_CALLBACK_RESET); if (err) panic("avc_add_callback() failed, error %d\n", err); return err; } __initcall(aurule_init); #ifdef CONFIG_NETLABEL /** * security_netlbl_cache_add - Add an entry to the NetLabel cache * @secattr: the NetLabel packet security attributes * @sid: the SELinux SID * * Description: * Attempt to cache the context in @ctx, which was derived from the packet in * @skb, in the NetLabel subsystem cache. This function assumes @secattr has * already been initialized. * */ static void security_netlbl_cache_add(struct netlbl_lsm_secattr *secattr, u32 sid) { u32 *sid_cache; sid_cache = kmalloc(sizeof(*sid_cache), GFP_ATOMIC); if (sid_cache == NULL) return; secattr->cache = netlbl_secattr_cache_alloc(GFP_ATOMIC); if (secattr->cache == NULL) { kfree(sid_cache); return; } *sid_cache = sid; secattr->cache->free = kfree; secattr->cache->data = sid_cache; secattr->flags |= NETLBL_SECATTR_CACHE; } /** * security_netlbl_secattr_to_sid - Convert a NetLabel secattr to a SELinux SID * @secattr: the NetLabel packet security attributes * @sid: the SELinux SID * * Description: * Convert the given NetLabel security attributes in @secattr into a * SELinux SID. If the @secattr field does not contain a full SELinux * SID/context then use SECINITSID_NETMSG as the foundation. 
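(i.e. the user, role and type of SECINITSID_NETMSG combined with the MLS level and categories carried in @secattr).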
If possible the * 'cache' field of @secattr is set and the CACHE flag is set; this is to * allow the @secattr to be used by NetLabel to cache the secattr to SID * conversion for future lookups. Returns zero on success, negative values on * failure. * */ int security_netlbl_secattr_to_sid(struct selinux_state *state, struct netlbl_lsm_secattr *secattr, u32 *sid) { struct selinux_policy *policy; struct policydb *policydb; struct sidtab *sidtab; int rc; struct context *ctx; struct context ctx_new; if (!selinux_initialized(state)) { *sid = SECSID_NULL; return 0; } retry: rc = 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; sidtab = policy->sidtab; if (secattr->flags & NETLBL_SECATTR_CACHE) *sid = *(u32 *)secattr->cache->data; else if (secattr->flags & NETLBL_SECATTR_SECID) *sid = secattr->attr.secid; else if (secattr->flags & NETLBL_SECATTR_MLS_LVL) { rc = -EIDRM; ctx = sidtab_search(sidtab, SECINITSID_NETMSG); if (ctx == NULL) goto out; context_init(&ctx_new); ctx_new.user = ctx->user; ctx_new.role = ctx->role; ctx_new.type = ctx->type; mls_import_netlbl_lvl(policydb, &ctx_new, secattr); if (secattr->flags & NETLBL_SECATTR_MLS_CAT) { rc = mls_import_netlbl_cat(policydb, &ctx_new, secattr); if (rc) goto out; } rc = -EIDRM; if (!mls_context_isvalid(policydb, &ctx_new)) { ebitmap_destroy(&ctx_new.range.level[0].cat); goto out; } rc = sidtab_context_to_sid(sidtab, &ctx_new, sid); ebitmap_destroy(&ctx_new.range.level[0].cat); if (rc == -ESTALE) { rcu_read_unlock(); goto retry; } if (rc) goto out; security_netlbl_cache_add(secattr, *sid); } else *sid = SECSID_NULL; out: rcu_read_unlock(); return rc; } /** * security_netlbl_sid_to_secattr - Convert a SELinux SID to a NetLabel secattr * @sid: the SELinux SID * @secattr: the NetLabel packet security attributes * * Description: * Convert the given SELinux SID in @sid into a NetLabel security attribute. * Returns zero on success, negative values on failure. * */ int security_netlbl_sid_to_secattr(struct selinux_state *state, u32 sid, struct netlbl_lsm_secattr *secattr) { struct selinux_policy *policy; struct policydb *policydb; int rc; struct context *ctx; if (!selinux_initialized(state)) return 0; rcu_read_lock(); policy = rcu_dereference(state->policy); policydb = &policy->policydb; rc = -ENOENT; ctx = sidtab_search(policy->sidtab, sid); if (ctx == NULL) goto out; rc = -ENOMEM; secattr->domain = kstrdup(sym_name(policydb, SYM_TYPES, ctx->type - 1), GFP_ATOMIC); if (secattr->domain == NULL) goto out; secattr->attr.secid = sid; secattr->flags |= NETLBL_SECATTR_DOMAIN_CPY | NETLBL_SECATTR_SECID; mls_export_netlbl_lvl(policydb, ctx, secattr); rc = mls_export_netlbl_cat(policydb, ctx, secattr); out: rcu_read_unlock(); return rc; } #endif /* CONFIG_NETLABEL */ /** * security_read_policy - read the policy. * @data: binary policy data * @len: length of data in bytes * */ int security_read_policy(struct selinux_state *state, void **data, size_t *len) { struct selinux_policy *policy; int rc; struct policy_file fp; policy = rcu_dereference_protected( state->policy, lockdep_is_held(&state->policy_mutex)); if (!policy) return -EINVAL; *len = policy->policydb.len; *data = vmalloc_user(*len); if (!*data) return -ENOMEM; fp.data = *data; fp.len = *len; rc = policydb_write(&policy->policydb, &fp); if (rc) return rc; *len = (unsigned long)fp.data - (unsigned long)*data; return 0; }
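/*
 * Illustrative sketch (not part of the kernel sources): how a caller of the
 * lookup helpers above is expected to honor the RCU + -ESTALE contract that
 * the comments on ocontext_to_sid() and sidtab_context_to_sid() describe.
 * The function name security_example_sid() is hypothetical; everything else
 * (selinux_initialized(), rcu_dereference(state->policy), the retry label)
 * mirrors the real lookup functions in this file.
 */
#if 0	/* example only -- never compiled */
static int security_example_sid(struct selinux_state *state, u32 *out_sid)
{
	struct selinux_policy *policy;
	struct ocontext *c;
	int rc;

	/* Before the first policy load, fall back to an initial SID. */
	if (!selinux_initialized(state)) {
		*out_sid = SECINITSID_UNLABELED;
		return 0;
	}

retry:
	rc = 0;
	rcu_read_lock();
	/* Snapshot the currently installed policy under RCU. */
	policy = rcu_dereference(state->policy);

	c = policy->policydb.ocontexts[OCON_PORT];	/* any ocontext list */
	if (c) {
		rc = ocontext_to_sid(policy->sidtab, c, 0, out_sid);
		if (rc == -ESTALE) {
			/* The policy was replaced under us: drop RCU and retry. */
			rcu_read_unlock();
			goto retry;
		}
	} else {
		*out_sid = SECINITSID_UNLABELED;
	}

	rcu_read_unlock();
	return rc;
}
#endif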
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator.
*/ #ifndef CONFIG_FORCE_MAX_ZONEORDER #define MAX_ORDER 11 #else #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. What is important though * is that a range of pageblocks must be aligned to * MAX_ORDER_NR_PAGES should biggest page be bigger then * a single pageblock. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { return list_first_entry_or_null(&area->free_list[migratetype], struct page, lru); } static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } struct pglist_data; /* * zone->lock and the zone lru_lock are two of the hottest locks in the kernel. * So add a wild amount of padding here to ensure that they fall into separate * cachelines. There are very few zone structures in the machine, so space * consumption is not a concern here. 
*/ #if defined(CONFIG_SMP) struct zone_padding { char x[0]; } ____cacheline_internodealigned_in_smp; #define ZONE_PADDING(name) struct zone_padding name; #else #define ZONE_PADDING(name) #endif #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_STAT_ITEMS }; #else #define NR_VM_NUMA_STAT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_PAGETABLE, /* used for pagetables */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. 
*/ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define ANON_AND_FILE 2 enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI */ }; struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif }; /* Isolate unmapped pages */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK }; #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[MIGRATE_PCPTYPES]; }; struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. 
This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Memory offlining might * retry a long time. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. 
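 * For example, with the default ratio of 256 an allocation that could have
 * been satisfied from ZONE_NORMAL must leave roughly 1/256th of
 * ZONE_NORMAL's managed pages free when it falls back into ZONE_DMA32.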
*/ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NEED_MULTIPLE_NODES int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/end(). Any reader who can't tolerant drift of * present_pages should get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ ZONE_PADDING(_pad2_) /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. 
*/ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) { return zone->spanned_pages == 0; } /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. 
Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; #ifndef CONFIG_DISCONTIGMEM /* The array of struct pages - for discontigmem use pgdat->lmem_map */ extern struct page *mem_map; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. 
*/ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; ZONE_PADDING(_pad2_) /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #ifdef CONFIG_FLAT_NODE_MEM_MAP #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) #else #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) #endif #define nid_page_nr(nid, pagenr) pgdat_page_nr(NODE_DATA(nid),(pagenr)) #define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) #define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid)) static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) { return pgdat->node_start_pfn + pgdat->node_spanned_pages; } static inline bool pgdat_is_empty(pg_data_t *pgdat) { return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; } #include <linux/memory_hotplug.h> void build_all_zonelists(pg_data_t *pgdat); void wakeup_kswapd(struct zone *zone, gfp_t gfp_mask, int order, enum zone_type highest_zoneidx); bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags, long free_pages); bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. */ enum meminit_context { MEMINIT_EARLY, MEMINIT_HOTPLUG, }; extern void init_currently_empty_zone(struct zone *zone, unsigned long start_pfn, unsigned long size); extern void lruvec_init(struct lruvec *lruvec); static inline struct pglist_data *lruvec_pgdat(struct lruvec *lruvec) { #ifdef CONFIG_MEMCG return lruvec->pgdat; #else return container_of(lruvec, struct pglist_data, __lruvec); #endif } extern unsigned long lruvec_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx); #ifdef CONFIG_HAVE_MEMORYLESS_NODES int local_memory_node(int node_id); #else static inline int local_memory_node(int node_id) { return node_id; }; #endif /* * zone_idx() returns 0 for the ZONE_DMA zone, 1 for the ZONE_NORMAL zone, etc. */ #define zone_idx(zone) ((zone) - (zone)->zone_pgdat->node_zones) /* * Returns true if a zone has pages managed by the buddy allocator. * All the reclaim decisions have to use this function rather than * populated_zone(). If the whole zone is reserved then we can easily * end up with populated_zone() && !managed_zone(). 
*/ static inline bool managed_zone(struct zone *zone) { return zone_managed_pages(zone); } /* Returns true if a zone has memory */ static inline bool populated_zone(struct zone *zone) { return zone->present_pages; } #ifdef CONFIG_NEED_MULTIPLE_NODES static inline int zone_to_nid(struct zone *zone) { return zone->node; } static inline void zone_set_nid(struct zone *zone, int nid) { zone->node = nid; } #else static inline int zone_to_nid(struct zone *zone) { return 0; } static inline void zone_set_nid(struct zone *zone, int nid) {} #endif extern int movable_zone; #ifdef CONFIG_HIGHMEM static inline int zone_movable_is_highmem(void) { #ifdef CONFIG_NEED_MULTIPLE_NODES return movable_zone == ZONE_HIGHMEM; #else return (ZONE_MOVABLE - 1) == ZONE_HIGHMEM; #endif } #endif static inline int is_highmem_idx(enum zone_type idx) { #ifdef CONFIG_HIGHMEM return (idx == ZONE_HIGHMEM || (idx == ZONE_MOVABLE && zone_movable_is_highmem())); #else return 0; #endif } /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. * @zone - pointer to struct zone variable */ static inline int is_highmem(struct zone *zone) { #ifdef CONFIG_HIGHMEM return is_highmem_idx(zone_idx(zone)); #else return 0; #endif } /* These two functions are used to setup the per zone pages min values */ struct ctl_table; int min_free_kbytes_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int watermark_scale_factor_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); extern int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES]; int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int numa_zonelist_order_handler(struct ctl_table *, int, void *, size_t *, loff_t *); extern int percpu_pagelist_fraction; extern char numa_zonelist_order[]; #define NUMA_ZONELIST_ORDER_LEN 16 #ifndef CONFIG_NEED_MULTIPLE_NODES extern struct pglist_data contig_page_data; #define NODE_DATA(nid) (&contig_page_data) #define NODE_MEM_MAP(nid) mem_map #else /* CONFIG_NEED_MULTIPLE_NODES */ #include <asm/mmzone.h> #endif /* !CONFIG_NEED_MULTIPLE_NODES */ extern struct pglist_data *first_online_pgdat(void); extern struct pglist_data *next_online_pgdat(struct pglist_data *pgdat); extern struct zone *next_zone(struct zone *zone); /** * for_each_online_pgdat - helper macro to iterate over all online nodes * @pgdat - pointer to a pg_data_t variable */ #define for_each_online_pgdat(pgdat) \ for (pgdat = first_online_pgdat(); \ pgdat; \ pgdat = next_online_pgdat(pgdat)) /** * for_each_zone - helper macro to iterate over all memory zones * @zone - pointer to struct zone variable * * The user only needs to declare the zone variable, for_each_zone * fills it in. 
*/ #define for_each_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) #define for_each_populated_zone(zone) \ for (zone = (first_online_pgdat())->node_zones; \ zone; \ zone = next_zone(zone)) \ if (!populated_zone(zone)) \ ; /* do nothing */ \ else static inline struct zone *zonelist_zone(struct zoneref *zoneref) { return zoneref->zone; } static inline int zonelist_zone_idx(struct zoneref *zoneref) { return zoneref->zone_idx; } static inline int zonelist_node_idx(struct zoneref *zoneref) { return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes); /** * next_zones_zonelist - Returns the next zone at or below highest_zoneidx within the allowed nodemask using a cursor within a zonelist as a starting point * @z - The cursor used as a starting point for the search * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with * * This function returns the next zone at or below a given zone index that is * within the allowed nodemask using a cursor as the starting point for the * search. The zoneref returned is a cursor that represents the current zone * being examined. It should be advanced by one before calling * next_zones_zonelist again. */ static __always_inline struct zoneref *next_zones_zonelist(struct zoneref *z, enum zone_type highest_zoneidx, nodemask_t *nodes) { if (likely(!nodes && zonelist_zone_idx(z) <= highest_zoneidx)) return z; return __next_zones_zonelist(z, highest_zoneidx, nodes); } /** * first_zones_zonelist - Returns the first zone at or below highest_zoneidx within the allowed nodemask in a zonelist * @zonelist - The zonelist to search for a suitable zone * @highest_zoneidx - The zone index of the highest zone to return * @nodes - An optional nodemask to filter the zonelist with * @return - Zoneref pointer for the first suitable zone found (see below) * * This function returns the first zone at or below a given zone index that is * within the allowed nodemask. The zoneref returned is a cursor that can be * used to iterate the zonelist with next_zones_zonelist by advancing it by * one before calling. * * When no eligible zone is found, zoneref->zone is NULL (zoneref itself is * never NULL). This may happen either genuinely, or due to concurrent nodemask * update due to cpuset modification. 
*/ static inline struct zoneref *first_zones_zonelist(struct zonelist *zonelist, enum zone_type highest_zoneidx, nodemask_t *nodes) { return next_zones_zonelist(zonelist->_zonerefs, highest_zoneidx, nodes); } /** * for_each_zone_zonelist_nodemask - helper macro to iterate over valid zones in a zonelist at or below a given zone index and within a nodemask * @zone - The current zone in the iterator * @z - The current pointer within zonelist->_zonerefs being iterated * @zlist - The zonelist being iterated * @highidx - The zone index of the highest zone to return * @nodemask - Nodemask allowed by the allocator * * This iterator iterates though all zones at or below a given zone index and * within a given nodemask */ #define for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, nodemask) \ for (z = first_zones_zonelist(zlist, highidx, nodemask), zone = zonelist_zone(z); \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) #define for_next_zone_zonelist_nodemask(zone, z, highidx, nodemask) \ for (zone = z->zone; \ zone; \ z = next_zones_zonelist(++z, highidx, nodemask), \ zone = zonelist_zone(z)) /** * for_each_zone_zonelist - helper macro to iterate over valid zones in a zonelist at or below a given zone index * @zone - The current zone in the iterator * @z - The current pointer within zonelist->zones being iterated * @zlist - The zonelist being iterated * @highidx - The zone index of the highest zone to return * * This iterator iterates though all zones at or below a given zone index. */ #define for_each_zone_zonelist(zone, z, zlist, highidx) \ for_each_zone_zonelist_nodemask(zone, z, zlist, highidx, NULL) #ifdef CONFIG_SPARSEMEM #include <asm/sparsemem.h> #endif #ifdef CONFIG_FLATMEM #define pfn_to_nid(pfn) (0) #endif #ifdef CONFIG_SPARSEMEM /* * SECTION_SHIFT #bits space required to store a section # * * PA_SECTION_SHIFT physical address to/from section number * PFN_SECTION_SHIFT pfn to/from section number */ #define PA_SECTION_SHIFT (SECTION_SIZE_BITS) #define PFN_SECTION_SHIFT (SECTION_SIZE_BITS - PAGE_SHIFT) #define NR_MEM_SECTIONS (1UL << SECTIONS_SHIFT) #define PAGES_PER_SECTION (1UL << PFN_SECTION_SHIFT) #define PAGE_SECTION_MASK (~(PAGES_PER_SECTION-1)) #define SECTION_BLOCKFLAGS_BITS \ ((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) #if (MAX_ORDER - 1 + PAGE_SHIFT) > SECTION_SIZE_BITS #error Allocator MAX_ORDER exceeds SECTION_SIZE #endif static inline unsigned long pfn_to_section_nr(unsigned long pfn) { return pfn >> PFN_SECTION_SHIFT; } static inline unsigned long section_nr_to_pfn(unsigned long sec) { return sec << PFN_SECTION_SHIFT; } #define SECTION_ALIGN_UP(pfn) (((pfn) + PAGES_PER_SECTION - 1) & PAGE_SECTION_MASK) #define SECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SECTION_MASK) #define SUBSECTION_SHIFT 21 #define SUBSECTION_SIZE (1UL << SUBSECTION_SHIFT) #define PFN_SUBSECTION_SHIFT (SUBSECTION_SHIFT - PAGE_SHIFT) #define PAGES_PER_SUBSECTION (1UL << PFN_SUBSECTION_SHIFT) #define PAGE_SUBSECTION_MASK (~(PAGES_PER_SUBSECTION-1)) #if SUBSECTION_SHIFT > SECTION_SIZE_BITS #error Subsection size exceeds section size #else #define SUBSECTIONS_PER_SECTION (1UL << (SECTION_SIZE_BITS - SUBSECTION_SHIFT)) #endif #define SUBSECTION_ALIGN_UP(pfn) ALIGN((pfn), PAGES_PER_SUBSECTION) #define SUBSECTION_ALIGN_DOWN(pfn) ((pfn) & PAGE_SUBSECTION_MASK) struct mem_section_usage { #ifdef CONFIG_SPARSEMEM_VMEMMAP DECLARE_BITMAP(subsection_map, SUBSECTIONS_PER_SECTION); #endif /* See declaration of similar field in struct zone */ unsigned long 
pageblock_flags[0]; }; void subsection_map_init(unsigned long pfn, unsigned long nr_pages); struct page; struct page_ext; struct mem_section { /* * This is, logically, a pointer to an array of struct * pages. However, it is stored with some other magic. * (see sparse.c::sparse_init_one_section()) * * Additionally during early boot we encode node id of * the location of the section here to guide allocation. * (see sparse.c::memory_present()) * * Making it a UL at least makes someone do a cast * before using it wrong. */ unsigned long section_mem_map; struct mem_section_usage *usage; #ifdef CONFIG_PAGE_EXTENSION /* * If SPARSEMEM, pgdat doesn't have page_ext pointer. We use * section. (see page_ext.h about this.) */ struct page_ext *page_ext; unsigned long pad; #endif /* * WARNING: mem_section must be a power-of-2 in size for the * calculation and use of SECTION_ROOT_MASK to make sense. */ }; #ifdef CONFIG_SPARSEMEM_EXTREME #define SECTIONS_PER_ROOT (PAGE_SIZE / sizeof (struct mem_section)) #else #define SECTIONS_PER_ROOT 1 #endif #define SECTION_NR_TO_ROOT(sec) ((sec) / SECTIONS_PER_ROOT) #define NR_SECTION_ROOTS DIV_ROUND_UP(NR_MEM_SECTIONS, SECTIONS_PER_ROOT) #define SECTION_ROOT_MASK (SECTIONS_PER_ROOT - 1) #ifdef CONFIG_SPARSEMEM_EXTREME extern struct mem_section **mem_section; #else extern struct mem_section mem_section[NR_SECTION_ROOTS][SECTIONS_PER_ROOT]; #endif static inline unsigned long *section_to_usemap(struct mem_section *ms) { return ms->usage->pageblock_flags; } static inline struct mem_section *__nr_to_section(unsigned long nr) { #ifdef CONFIG_SPARSEMEM_EXTREME if (!mem_section) return NULL; #endif if (!mem_section[SECTION_NR_TO_ROOT(nr)]) return NULL; return &mem_section[SECTION_NR_TO_ROOT(nr)][nr & SECTION_ROOT_MASK]; } extern unsigned long __section_nr(struct mem_section *ms); extern size_t mem_section_usage_size(void); /* * We use the lower bits of the mem_map pointer to store * a little bit of information. The pointer is calculated * as mem_map - section_nr_to_pfn(pnum). The result is * aligned to the minimum alignment of the two values: * 1. All mem_map arrays are page-aligned. * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT * lowest bits. PFN_SECTION_SHIFT is arch-specific * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the * worst combination is powerpc with 256k pages, * which results in PFN_SECTION_SHIFT equal 6. * To sum it up, at least 6 bits are available. 
*/ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) #define SECTION_IS_ONLINE (1UL<<2) #define SECTION_IS_EARLY (1UL<<3) #define SECTION_MAP_LAST_BIT (1UL<<4) #define SECTION_MAP_MASK (~(SECTION_MAP_LAST_BIT-1)) #define SECTION_NID_SHIFT 3 static inline struct page *__section_mem_map_addr(struct mem_section *section) { unsigned long map = section->section_mem_map; map &= SECTION_MAP_MASK; return (struct page *)map; } static inline int present_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_MARKED_PRESENT)); } static inline int present_section_nr(unsigned long nr) { return present_section(__nr_to_section(nr)); } static inline int valid_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_HAS_MEM_MAP)); } static inline int early_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_EARLY)); } static inline int valid_section_nr(unsigned long nr) { return valid_section(__nr_to_section(nr)); } static inline int online_section(struct mem_section *section) { return (section && (section->section_mem_map & SECTION_IS_ONLINE)); } static inline int online_section_nr(unsigned long nr) { return online_section(__nr_to_section(nr)); } #ifdef CONFIG_MEMORY_HOTPLUG void online_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #ifdef CONFIG_MEMORY_HOTREMOVE void offline_mem_sections(unsigned long start_pfn, unsigned long end_pfn); #endif #endif static inline struct mem_section *__pfn_to_section(unsigned long pfn) { return __nr_to_section(pfn_to_section_nr(pfn)); } extern unsigned long __highest_present_section_nr; static inline int subsection_map_index(unsigned long pfn) { return (pfn & ~(PAGE_SECTION_MASK)) / PAGES_PER_SUBSECTION; } #ifdef CONFIG_SPARSEMEM_VMEMMAP static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { int idx = subsection_map_index(pfn); return test_bit(idx, ms->usage->subsection_map); } #else static inline int pfn_section_valid(struct mem_section *ms, unsigned long pfn) { return 1; } #endif #ifndef CONFIG_HAVE_ARCH_PFN_VALID static inline int pfn_valid(unsigned long pfn) { struct mem_section *ms; if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; ms = __nr_to_section(pfn_to_section_nr(pfn)); if (!valid_section(ms)) return 0; /* * Traditionally early sections always returned pfn_valid() for * the entire section-sized span. */ return early_section(ms) || pfn_section_valid(ms, pfn); } #endif static inline int pfn_in_present_section(unsigned long pfn) { if (pfn_to_section_nr(pfn) >= NR_MEM_SECTIONS) return 0; return present_section(__nr_to_section(pfn_to_section_nr(pfn))); } static inline unsigned long next_present_section_nr(unsigned long section_nr) { while (++section_nr <= __highest_present_section_nr) { if (present_section_nr(section_nr)) return section_nr; } return -1; } /* * These are _only_ used during initialisation, therefore they * can use __initdata ... They could have names to indicate * this restriction. 
*/ #ifdef CONFIG_NUMA #define pfn_to_nid(pfn) \ ({ \ unsigned long __pfn_to_nid_pfn = (pfn); \ page_to_nid(pfn_to_page(__pfn_to_nid_pfn)); \ }) #else #define pfn_to_nid(pfn) (0) #endif void sparse_init(void); #else #define sparse_init() do {} while (0) #define sparse_index_init(_sec, _nid) do {} while (0) #define pfn_in_present_section pfn_valid #define subsection_map_init(_pfn, _nr_pages) do {} while (0) #endif /* CONFIG_SPARSEMEM */ /* * During memory init memblocks map pfns to nids. The search is expensive and * this caches recent lookups. The implementation of __early_pfn_to_nid * may treat start/end as pfns or sections. */ struct mminit_pfnnid_cache { unsigned long last_start; unsigned long last_end; int last_nid; }; /* * If it is possible to have holes within a MAX_ORDER_NR_PAGES, then we * need to check pfn validity within that MAX_ORDER_NR_PAGES block. * pfn_valid_within() should be used in this case; we optimise this away * when we have no holes within a MAX_ORDER_NR_PAGES block. */ #ifdef CONFIG_HOLES_IN_ZONE #define pfn_valid_within(pfn) pfn_valid(pfn) #else #define pfn_valid_within(pfn) (1) #endif #ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL /* * pfn_valid() is meant to be able to tell if a given PFN has valid memmap * associated with it or not. This means that a struct page exists for this * pfn. The caller cannot assume the page is fully initialized in general. * Hotplugable pages might not have been onlined yet. pfn_to_online_page() * will ensure the struct page is fully online and initialized. Special pages * (e.g. ZONE_DEVICE) are never onlined and should be treated accordingly. * * In FLATMEM, it is expected that holes always have valid memmap as long as * there is valid PFNs either side of the hole. In SPARSEMEM, it is assumed * that a valid section has a memmap for the entire section. * * However, an ARM, and maybe other embedded architectures in the future * free memmap backing holes to save memory on the assumption the memmap is * never used. The page_zone linkages are then broken even though pfn_valid() * returns true. A walker of the full memmap must then do this additional * check to ensure the memmap they are looking at is sane by making sure * the zone and PFN linkages are still valid. This is expensive, but walkers * of the full memmap are extremely rare. */ bool memmap_valid_within(unsigned long pfn, struct page *page, struct zone *zone); #else static inline bool memmap_valid_within(unsigned long pfn, struct page *page, struct zone *zone) { return true; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ #endif /* !__GENERATING_BOUNDS.H */ #endif /* !__ASSEMBLY__ */ #endif /* _LINUX_MMZONE_H */
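/*
 * Illustrative sketch (not part of mmzone.h): one way the zonelist iterator
 * and the helpers declared above can be combined. example_pick_zone() is
 * hypothetical; for_each_zone_zonelist_nodemask(), managed_zone() and
 * zone_watermark_ok() are declared earlier in this header, and
 * low_wmark_pages() is assumed to be the usual mmzone.h watermark macro.
 */
static struct zone *example_pick_zone(struct zonelist *zonelist,
				      enum zone_type highidx,
				      nodemask_t *nodemask,
				      unsigned int order)
{
	struct zoneref *z;
	struct zone *zone;

	for_each_zone_zonelist_nodemask(zone, z, zonelist, highidx, nodemask) {
		/* Skip zones with no buddy-managed pages (see managed_zone()). */
		if (!managed_zone(zone))
			continue;

		/* Accept the first zone still above its low watermark. */
		if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
				      highidx, 0))
			return zone;
	}

	return NULL;	/* no eligible zone in this zonelist */
}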
/* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM tlb #if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TLB_H #include <linux/mm_types.h> #include <linux/tracepoint.h> #define TLB_FLUSH_REASON \ EM( TLB_FLUSH_ON_TASK_SWITCH, "flush on task switch" ) \ EM( TLB_REMOTE_SHOOTDOWN, "remote shootdown" ) \ EM( TLB_LOCAL_SHOOTDOWN, "local shootdown" ) \ EM( TLB_LOCAL_MM_SHOOTDOWN, "local mm shootdown" ) \ EMe( TLB_REMOTE_SEND_IPI, "remote ipi send" ) /* * First define the enums in TLB_FLUSH_REASON to be exported to userspace * via TRACE_DEFINE_ENUM(). */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); TLB_FLUSH_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } TRACE_EVENT(tlb_flush, TP_PROTO(int reason, unsigned long pages), TP_ARGS(reason, pages), TP_STRUCT__entry( __field( int, reason) __field(unsigned long, pages) ), TP_fast_assign( __entry->reason = reason; __entry->pages = pages; ), TP_printk("pages:%ld reason:%s (%d)", __entry->pages, __print_symbolic(__entry->reason, TLB_FLUSH_REASON), __entry->reason) ); #endif /* _TRACE_TLB_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
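/*
 * Illustrative usage sketch (not part of this trace header): the
 * TRACE_EVENT(tlb_flush) definition above generates trace_tlb_flush(),
 * which flush paths call with a reason from enum tlb_flush_reason
 * (<linux/mm_types.h>) and a page count. example_flush_local_tlb() is
 * hypothetical; note that exactly one .c file must define
 * CREATE_TRACE_POINTS before including this header so the tracepoint
 * is actually instantiated.
 */
#include <trace/events/tlb.h>

static void example_flush_local_tlb(unsigned long nr_pages)
{
	/* ...architecture-specific TLB invalidation would go here... */

	/* Report the flush; the reason maps to a TLB_FLUSH_REASON string. */
	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, nr_pages);
}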
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CLEANCACHE_H #define _LINUX_CLEANCACHE_H #include <linux/fs.h> #include <linux/exportfs.h> #include <linux/mm.h> #define CLEANCACHE_NO_POOL -1 #define CLEANCACHE_NO_BACKEND -2 #define CLEANCACHE_NO_BACKEND_SHARED -3 #define CLEANCACHE_KEY_MAX 6 /* * cleancache requires every file with a page in cleancache to have a * unique key unless/until the file is removed/truncated. For some * filesystems, the inode number is unique, but for "modern" filesystems * an exportable filehandle is required (see exportfs.h) */ struct cleancache_filekey { union { ino_t ino; __u32 fh[CLEANCACHE_KEY_MAX]; u32 key[CLEANCACHE_KEY_MAX]; } u; }; struct cleancache_ops { int (*init_fs)(size_t); int (*init_shared_fs)(uuid_t *uuid, size_t); int (*get_page)(int, struct cleancache_filekey, pgoff_t, struct page *); void (*put_page)(int, struct cleancache_filekey, pgoff_t, struct page *); void (*invalidate_page)(int, struct cleancache_filekey, pgoff_t); void (*invalidate_inode)(int, struct cleancache_filekey); void (*invalidate_fs)(int); }; extern int cleancache_register_ops(const struct cleancache_ops *ops); extern void __cleancache_init_fs(struct super_block *); extern void __cleancache_init_shared_fs(struct super_block *); extern int __cleancache_get_page(struct page *); extern void __cleancache_put_page(struct page *); extern void __cleancache_invalidate_page(struct address_space *, struct page *); extern void __cleancache_invalidate_inode(struct address_space *); extern void __cleancache_invalidate_fs(struct super_block *); #ifdef CONFIG_CLEANCACHE #define cleancache_enabled (1) static inline bool cleancache_fs_enabled_mapping(struct address_space *mapping) { return mapping->host->i_sb->cleancache_poolid >= 0; } static inline bool cleancache_fs_enabled(struct page *page) { return cleancache_fs_enabled_mapping(page->mapping); } #else #define cleancache_enabled (0) #define cleancache_fs_enabled(_page) (0) #define cleancache_fs_enabled_mapping(_page) (0) #endif /* * The shim layer provided by these inline functions allows the compiler * to reduce all cleancache hooks to nothingness if CONFIG_CLEANCACHE * is disabled, to a single global variable check if CONFIG_CLEANCACHE * is enabled but no cleancache "backend" has dynamically enabled it, * and, for the most frequent cleancache ops, to a single global variable * check plus a superblock element comparison if CONFIG_CLEANCACHE is enabled * and a cleancache backend has dynamically enabled cleancache, but the * filesystem referenced by that cleancache op has not enabled cleancache. * As a result, CONFIG_CLEANCACHE can be enabled by default with essentially * no measurable performance impact.
*/ static inline void cleancache_init_fs(struct super_block *sb) { if (cleancache_enabled) __cleancache_init_fs(sb); } static inline void cleancache_init_shared_fs(struct super_block *sb) { if (cleancache_enabled) __cleancache_init_shared_fs(sb); } static inline int cleancache_get_page(struct page *page) { if (cleancache_enabled && cleancache_fs_enabled(page)) return __cleancache_get_page(page); return -1; } static inline void cleancache_put_page(struct page *page) { if (cleancache_enabled && cleancache_fs_enabled(page)) __cleancache_put_page(page); } static inline void cleancache_invalidate_page(struct address_space *mapping, struct page *page) { /* careful... page->mapping is NULL sometimes when this is called */ if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) __cleancache_invalidate_page(mapping, page); } static inline void cleancache_invalidate_inode(struct address_space *mapping) { if (cleancache_enabled && cleancache_fs_enabled_mapping(mapping)) __cleancache_invalidate_inode(mapping); } static inline void cleancache_invalidate_fs(struct super_block *sb) { if (cleancache_enabled) __cleancache_invalidate_fs(sb); } #endif /* _LINUX_CLEANCACHE_H */
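/*
 * Illustrative sketch (not part of cleancache.h): how a read path might
 * consult cleancache before issuing real I/O. A return value of 0 from
 * cleancache_get_page() means the backend filled the page. Both
 * example_readpage() and example_submit_read() are hypothetical.
 */
#include <linux/pagemap.h>

static int example_submit_read(struct page *page);	/* hypothetical I/O path */

static int example_readpage(struct file *file, struct page *page)
{
	/* Hit: the cleancache backend copied the data into @page. */
	if (cleancache_get_page(page) == 0) {
		SetPageUptodate(page);
		unlock_page(page);
		return 0;
	}

	/* Miss: fall back to reading the page from the backing store. */
	return example_submit_read(page);
}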
// SPDX-License-Identifier: GPL-2.0 /* * device.h - generic, centralized driver model * * Copyright (c) 2001-2003 Patrick Mochel <mochel@osdl.org> * Copyright (c) 2004-2009 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2008-2009 Novell Inc. * * See Documentation/driver-api/driver-model/ for more information. */ #ifndef _DEVICE_H_ #define _DEVICE_H_ #include <linux/dev_printk.h> #include <linux/energy_model.h> #include <linux/ioport.h> #include <linux/kobject.h> #include <linux/klist.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/compiler.h> #include <linux/types.h> #include <linux/mutex.h> #include <linux/pm.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/gfp.h> #include <linux/overflow.h> #include <linux/device/bus.h> #include <linux/device/class.h> #include <linux/device/driver.h> #include <asm/device.h> struct device; struct device_private; struct device_driver; struct driver_private; struct module; struct class; struct subsys_private; struct device_node; struct fwnode_handle; struct iommu_ops; struct iommu_group; struct dev_pin_info; struct dev_iommu; /** * struct subsys_interface - interfaces to device functions * @name: name of the device function * @subsys: subsystem of the devices to attach to * @node: the list of functions registered at the subsystem * @add_dev: device hookup to device function handler * @remove_dev: device hookup to device function handler * * Simple interfaces attached to a subsystem. Multiple interfaces can * attach to a subsystem and its devices. Unlike drivers, they do not * exclusively claim or control devices. Interfaces usually represent * a specific functionality of a subsystem/class of devices. */ struct subsys_interface { const char *name; struct bus_type *subsys; struct list_head node; int (*add_dev)(struct device *dev, struct subsys_interface *sif); void (*remove_dev)(struct device *dev, struct subsys_interface *sif); }; int subsys_interface_register(struct subsys_interface *sif); void subsys_interface_unregister(struct subsys_interface *sif); int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups); int subsys_virtual_register(struct bus_type *subsys, const struct attribute_group **groups); /* * The type of device, "struct device" is embedded in. A class * or bus can contain devices of different types * like "partitions" and "disks", "mouse" and "event". * This identifies the device type and carries type-specific * information, equivalent to the kobj_type of a kobject. * If "name" is specified, the uevent will contain it in * the DEVTYPE variable.
*/ struct device_type { const char *name; const struct attribute_group **groups; int (*uevent)(struct device *dev, struct kobj_uevent_env *env); char *(*devnode)(struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid); void (*release)(struct device *dev); const struct dev_pm_ops *pm; }; /* interface for exporting device attributes */ struct device_attribute { struct attribute attr; ssize_t (*show)(struct device *dev, struct device_attribute *attr, char *buf); ssize_t (*store)(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); }; struct dev_ext_attribute { struct device_attribute attr; void *var; }; ssize_t device_show_ulong(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_ulong(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_int(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_int(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); ssize_t device_show_bool(struct device *dev, struct device_attribute *attr, char *buf); ssize_t device_store_bool(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); #define DEVICE_ATTR(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = __ATTR(_name, _mode, _show, _store) #define DEVICE_ATTR_PREALLOC(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_PREALLOC(_name, _mode, _show, _store) #define DEVICE_ATTR_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW(_name) #define DEVICE_ATTR_ADMIN_RW(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RW_MODE(_name, 0600) #define DEVICE_ATTR_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO(_name) #define DEVICE_ATTR_ADMIN_RO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_RO_MODE(_name, 0400) #define DEVICE_ATTR_WO(_name) \ struct device_attribute dev_attr_##_name = __ATTR_WO(_name) #define DEVICE_ULONG_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_ulong, device_store_ulong), &(_var) } #define DEVICE_INT_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_int, device_store_int), &(_var) } #define DEVICE_BOOL_ATTR(_name, _mode, _var) \ struct dev_ext_attribute dev_attr_##_name = \ { __ATTR(_name, _mode, device_show_bool, device_store_bool), &(_var) } #define DEVICE_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct device_attribute dev_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) int device_create_file(struct device *device, const struct device_attribute *entry); void device_remove_file(struct device *dev, const struct device_attribute *attr); bool device_remove_file_self(struct device *dev, const struct device_attribute *attr); int __must_check device_create_bin_file(struct device *dev, const struct bin_attribute *attr); void device_remove_bin_file(struct device *dev, const struct bin_attribute *attr); /* device resource management */ typedef void (*dr_release_t)(struct device *dev, void *res); typedef int (*dr_match_t)(struct device *dev, void *res, void *match_data); #ifdef CONFIG_DEBUG_DEVRES void *__devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid, const char *name) __malloc; #define devres_alloc(release, size, gfp) \ __devres_alloc_node(release, size, gfp, NUMA_NO_NODE, #release) #define devres_alloc_node(release, size, gfp, 
nid) \ __devres_alloc_node(release, size, gfp, nid, #release) #else void *devres_alloc_node(dr_release_t release, size_t size, gfp_t gfp, int nid) __malloc; static inline void *devres_alloc(dr_release_t release, size_t size, gfp_t gfp) { return devres_alloc_node(release, size, gfp, NUMA_NO_NODE); } #endif void devres_for_each_res(struct device *dev, dr_release_t release, dr_match_t match, void *match_data, void (*fn)(struct device *, void *, void *), void *data); void devres_free(void *res); void devres_add(struct device *dev, void *res); void *devres_find(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); void *devres_get(struct device *dev, void *new_res, dr_match_t match, void *match_data); void *devres_remove(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); int devres_destroy(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); int devres_release(struct device *dev, dr_release_t release, dr_match_t match, void *match_data); /* devres group */ void * __must_check devres_open_group(struct device *dev, void *id, gfp_t gfp); void devres_close_group(struct device *dev, void *id); void devres_remove_group(struct device *dev, void *id); int devres_release_group(struct device *dev, void *id); /* managed devm_k.alloc/kfree for device drivers */ void *devm_kmalloc(struct device *dev, size_t size, gfp_t gfp) __malloc; void *devm_krealloc(struct device *dev, void *ptr, size_t size, gfp_t gfp) __must_check; __printf(3, 0) char *devm_kvasprintf(struct device *dev, gfp_t gfp, const char *fmt, va_list ap) __malloc; __printf(3, 4) char *devm_kasprintf(struct device *dev, gfp_t gfp, const char *fmt, ...) __malloc; static inline void *devm_kzalloc(struct device *dev, size_t size, gfp_t gfp) { return devm_kmalloc(dev, size, gfp | __GFP_ZERO); } static inline void *devm_kmalloc_array(struct device *dev, size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return devm_kmalloc(dev, bytes, flags); } static inline void *devm_kcalloc(struct device *dev, size_t n, size_t size, gfp_t flags) { return devm_kmalloc_array(dev, n, size, flags | __GFP_ZERO); } void devm_kfree(struct device *dev, const void *p); char *devm_kstrdup(struct device *dev, const char *s, gfp_t gfp) __malloc; const char *devm_kstrdup_const(struct device *dev, const char *s, gfp_t gfp); void *devm_kmemdup(struct device *dev, const void *src, size_t len, gfp_t gfp); unsigned long devm_get_free_pages(struct device *dev, gfp_t gfp_mask, unsigned int order); void devm_free_pages(struct device *dev, unsigned long addr); void __iomem *devm_ioremap_resource(struct device *dev, const struct resource *res); void __iomem *devm_ioremap_resource_wc(struct device *dev, const struct resource *res); void __iomem *devm_of_iomap(struct device *dev, struct device_node *node, int index, resource_size_t *size); /* allows to add/remove a custom action to devres stack */ int devm_add_action(struct device *dev, void (*action)(void *), void *data); void devm_remove_action(struct device *dev, void (*action)(void *), void *data); void devm_release_action(struct device *dev, void (*action)(void *), void *data); static inline int devm_add_action_or_reset(struct device *dev, void (*action)(void *), void *data) { int ret; ret = devm_add_action(dev, action, data); if (ret) action(data); return ret; } /** * devm_alloc_percpu - Resource-managed alloc_percpu * @dev: Device to allocate per-cpu memory for * @type: Type to allocate 
per-cpu memory for * * Managed alloc_percpu. Per-cpu memory allocated with this function is * automatically freed on driver detach. * * RETURNS: * Pointer to allocated memory on success, NULL on failure. */ #define devm_alloc_percpu(dev, type) \ ((typeof(type) __percpu *)__devm_alloc_percpu((dev), sizeof(type), \ __alignof__(type))) void __percpu *__devm_alloc_percpu(struct device *dev, size_t size, size_t align); void devm_free_percpu(struct device *dev, void __percpu *pdata); struct device_dma_parameters { /* * a low level driver may set these to teach IOMMU code about * sg limitations. */ unsigned int max_segment_size; unsigned int min_align_mask; unsigned long segment_boundary_mask; }; /** * enum device_link_state - Device link states. * @DL_STATE_NONE: The presence of the drivers is not being tracked. * @DL_STATE_DORMANT: None of the supplier/consumer drivers is present. * @DL_STATE_AVAILABLE: The supplier driver is present, but the consumer is not. * @DL_STATE_CONSUMER_PROBE: The consumer is probing (supplier driver present). * @DL_STATE_ACTIVE: Both the supplier and consumer drivers are present. * @DL_STATE_SUPPLIER_UNBIND: The supplier driver is unbinding. */ enum device_link_state { DL_STATE_NONE = -1, DL_STATE_DORMANT = 0, DL_STATE_AVAILABLE, DL_STATE_CONSUMER_PROBE, DL_STATE_ACTIVE, DL_STATE_SUPPLIER_UNBIND, }; /* * Device link flags. * * STATELESS: The core will not remove this link automatically. * AUTOREMOVE_CONSUMER: Remove the link automatically on consumer driver unbind. * PM_RUNTIME: If set, the runtime PM framework will use this link. * RPM_ACTIVE: Run pm_runtime_get_sync() on the supplier during link creation. * AUTOREMOVE_SUPPLIER: Remove the link automatically on supplier driver unbind. * AUTOPROBE_CONSUMER: Probe consumer driver automatically after supplier binds. * MANAGED: The core tracks presence of supplier/consumer drivers (internal). * SYNC_STATE_ONLY: Link only affects sync_state() behavior. */ #define DL_FLAG_STATELESS BIT(0) #define DL_FLAG_AUTOREMOVE_CONSUMER BIT(1) #define DL_FLAG_PM_RUNTIME BIT(2) #define DL_FLAG_RPM_ACTIVE BIT(3) #define DL_FLAG_AUTOREMOVE_SUPPLIER BIT(4) #define DL_FLAG_AUTOPROBE_CONSUMER BIT(5) #define DL_FLAG_MANAGED BIT(6) #define DL_FLAG_SYNC_STATE_ONLY BIT(7) /** * enum dl_dev_state - Device driver presence tracking information. * @DL_DEV_NO_DRIVER: There is no driver attached to the device. * @DL_DEV_PROBING: A driver is probing. * @DL_DEV_DRIVER_BOUND: The driver has been bound to the device. * @DL_DEV_UNBINDING: The driver is unbinding from the device. */ enum dl_dev_state { DL_DEV_NO_DRIVER = 0, DL_DEV_PROBING, DL_DEV_DRIVER_BOUND, DL_DEV_UNBINDING, }; /** * struct dev_links_info - Device data related to device links. * @suppliers: List of links to supplier devices. * @consumers: List of links to consumer devices. * @needs_suppliers: Hook to global list of devices waiting for suppliers. * @defer_hook: Hook to global list of devices that have deferred sync_state or * deferred fw_devlink. * @need_for_probe: If needs_suppliers is on a list, this indicates if the * suppliers are needed for probe or not. * @status: Driver status information. */ struct dev_links_info { struct list_head suppliers; struct list_head consumers; struct list_head needs_suppliers; struct list_head defer_hook; bool need_for_probe; enum dl_dev_state status; }; /** * struct device - The basic device structure * @parent: The device's "parent" device, the device to which it is attached. * In most cases, a parent device is some sort of bus or host * controller. 
If parent is NULL, the device, is a top-level device, * which is not usually what you want. * @p: Holds the private data of the driver core portions of the device. * See the comment of the struct device_private for detail. * @kobj: A top-level, abstract class from which other classes are derived. * @init_name: Initial name of the device. * @type: The type of device. * This identifies the device type and carries type-specific * information. * @mutex: Mutex to synchronize calls to its driver. * @lockdep_mutex: An optional debug lock that a subsystem can use as a * peer lock to gain localized lockdep coverage of the device_lock. * @bus: Type of bus device is on. * @driver: Which driver has allocated this * @platform_data: Platform data specific to the device. * Example: For devices on custom boards, as typical of embedded * and SOC based hardware, Linux often uses platform_data to point * to board-specific structures describing devices and how they * are wired. That can include what ports are available, chip * variants, which GPIO pins act in what additional roles, and so * on. This shrinks the "Board Support Packages" (BSPs) and * minimizes board-specific #ifdefs in drivers. * @driver_data: Private pointer for driver specific info. * @links: Links to suppliers and consumers of this device. * @power: For device power management. * See Documentation/driver-api/pm/devices.rst for details. * @pm_domain: Provide callbacks that are executed during system suspend, * hibernation, system resume and during runtime PM transitions * along with subsystem-level and driver-level callbacks. * @em_pd: device's energy model performance domain * @pins: For device pin management. * See Documentation/driver-api/pinctl.rst for details. * @msi_list: Hosts MSI descriptors * @msi_domain: The generic MSI domain this device is using. * @numa_node: NUMA node this device is close to. * @dma_ops: DMA mapping operations for this device. * @dma_mask: Dma mask (if dma'ble device). * @coherent_dma_mask: Like dma_mask, but for alloc_coherent mapping as not all * hardware supports 64-bit addresses for consistent allocations * such descriptors. * @bus_dma_limit: Limit of an upstream bridge or bus which imposes a smaller * DMA limit than the device itself supports. * @dma_range_map: map for DMA memory ranges relative to that of RAM * @dma_parms: A low level driver may set these to teach IOMMU code about * segment limitations. * @dma_pools: Dma pools (if dma'ble device). * @dma_mem: Internal for coherent mem override. * @cma_area: Contiguous memory area for dma allocations * @archdata: For arch-specific additions. * @of_node: Associated device tree node. * @fwnode: Associated device node supplied by platform firmware. * @devt: For creating the sysfs "dev". * @id: device instance * @devres_lock: Spinlock to protect the resource of the device. * @devres_head: The resources list of the device. * @knode_class: The node used to add the device to the class list. * @class: The class of the device. * @groups: Optional attribute groups. * @release: Callback to free the device after all references have * gone away. This should be set by the allocator of the * device (i.e. the bus driver that discovered the device). * @iommu_group: IOMMU group the device belongs to. * @iommu: Per device generic IOMMU runtime data * * @offline_disabled: If set, the device is permanently online. * @offline: Set after successful invocation of bus type's .offline(). * @of_node_reused: Set if the device-tree node is shared with an ancestor * device. 
* @state_synced: The hardware state of this device has been synced to match * the software state of this device by calling the driver/bus * sync_state() callback. * @dma_coherent: this particular device is dma coherent, even if the * architecture supports non-coherent devices. * @dma_ops_bypass: If set to %true then the dma_ops are bypassed for the * streaming DMA operations (->map_* / ->unmap_* / ->sync_*), * and optionall (if the coherent mask is large enough) also * for dma allocations. This flag is managed by the dma ops * instance from ->dma_supported. * * At the lowest level, every device in a Linux system is represented by an * instance of struct device. The device structure contains the information * that the device model core needs to model the system. Most subsystems, * however, track additional information about the devices they host. As a * result, it is rare for devices to be represented by bare device structures; * instead, that structure, like kobject structures, is usually embedded within * a higher-level representation of the device. */ struct device { struct kobject kobj; struct device *parent; struct device_private *p; const char *init_name; /* initial name of the device */ const struct device_type *type; struct bus_type *bus; /* type of bus device is on */ struct device_driver *driver; /* which driver has allocated this device */ void *platform_data; /* Platform specific data, device core doesn't touch it */ void *driver_data; /* Driver data, set and get with dev_set_drvdata/dev_get_drvdata */ #ifdef CONFIG_PROVE_LOCKING struct mutex lockdep_mutex; #endif struct mutex mutex; /* mutex to synchronize calls to * its driver. */ struct dev_links_info links; struct dev_pm_info power; struct dev_pm_domain *pm_domain; #ifdef CONFIG_ENERGY_MODEL struct em_perf_domain *em_pd; #endif #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN struct irq_domain *msi_domain; #endif #ifdef CONFIG_PINCTRL struct dev_pin_info *pins; #endif #ifdef CONFIG_GENERIC_MSI_IRQ raw_spinlock_t msi_lock; struct list_head msi_list; #endif #ifdef CONFIG_DMA_OPS const struct dma_map_ops *dma_ops; #endif u64 *dma_mask; /* dma mask (if dma'able device) */ u64 coherent_dma_mask;/* Like dma_mask, but for alloc_coherent mappings as not all hardware supports 64 bit addresses for consistent allocations such descriptors. 
*/ u64 bus_dma_limit; /* upstream dma constraint */ const struct bus_dma_region *dma_range_map; struct device_dma_parameters *dma_parms; struct list_head dma_pools; /* dma pools (if dma'ble) */ #ifdef CONFIG_DMA_DECLARE_COHERENT struct dma_coherent_mem *dma_mem; /* internal for coherent mem override */ #endif #ifdef CONFIG_DMA_CMA struct cma *cma_area; /* contiguous memory area for dma allocations */ #endif /* arch specific additions */ struct dev_archdata archdata; struct device_node *of_node; /* associated device tree node */ struct fwnode_handle *fwnode; /* firmware device node */ #ifdef CONFIG_NUMA int numa_node; /* NUMA node this device is close to */ #endif dev_t devt; /* dev_t, creates the sysfs "dev" */ u32 id; /* device instance */ spinlock_t devres_lock; struct list_head devres_head; struct class *class; const struct attribute_group **groups; /* optional groups */ void (*release)(struct device *dev); struct iommu_group *iommu_group; struct dev_iommu *iommu; bool offline_disabled:1; bool offline:1; bool of_node_reused:1; bool state_synced:1; #if defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_DEVICE) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU) || \ defined(CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL) bool dma_coherent:1; #endif #ifdef CONFIG_DMA_OPS_BYPASS bool dma_ops_bypass : 1; #endif }; /** * struct device_link - Device link representation. * @supplier: The device on the supplier end of the link. * @s_node: Hook to the supplier device's list of links to consumers. * @consumer: The device on the consumer end of the link. * @c_node: Hook to the consumer device's list of links to suppliers. * @link_dev: device used to expose link details in sysfs * @status: The state of the link (with respect to the presence of drivers). * @flags: Link flags. * @rpm_active: Whether or not the consumer device is runtime-PM-active. * @kref: Count repeated addition of the same link. * @rm_work: Work structure used for removing the link. * @supplier_preactivated: Supplier has been made active before consumer probe. */ struct device_link { struct device *supplier; struct list_head s_node; struct device *consumer; struct list_head c_node; struct device link_dev; enum device_link_state status; u32 flags; refcount_t rpm_active; struct kref kref; struct work_struct rm_work; bool supplier_preactivated; /* Owned by consumer probe. */ }; static inline struct device *kobj_to_dev(struct kobject *kobj) { return container_of(kobj, struct device, kobj); } /** * device_iommu_mapped - Returns true when the device DMA is translated * by an IOMMU * @dev: Device to perform the check on */ static inline bool device_iommu_mapped(struct device *dev) { return (dev->iommu_group != NULL); } /* Get the wakeup routines, which depend on struct device */ #include <linux/pm_wakeup.h> static inline const char *dev_name(const struct device *dev) { /* Use the init name until the kobject becomes available */ if (dev->init_name) return dev->init_name; return kobject_name(&dev->kobj); } /** * dev_bus_name - Return a device's bus/class name, if at all possible * @dev: struct device to get the bus/class name of * * Will return the name of the bus/class the device is attached to. If it is * not attached to a bus/class, an empty string will be returned. */ static inline const char *dev_bus_name(const struct device *dev) { return dev->bus ? dev->bus->name : (dev->class ? 
dev->class->name : ""); } __printf(2, 3) int dev_set_name(struct device *dev, const char *name, ...); #ifdef CONFIG_NUMA static inline int dev_to_node(struct device *dev) { return dev->numa_node; } static inline void set_dev_node(struct device *dev, int node) { dev->numa_node = node; } #else static inline int dev_to_node(struct device *dev) { return NUMA_NO_NODE; } static inline void set_dev_node(struct device *dev, int node) { } #endif static inline struct irq_domain *dev_get_msi_domain(const struct device *dev) { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN return dev->msi_domain; #else return NULL; #endif } static inline void dev_set_msi_domain(struct device *dev, struct irq_domain *d) { #ifdef CONFIG_GENERIC_MSI_IRQ_DOMAIN dev->msi_domain = d; #endif } static inline void *dev_get_drvdata(const struct device *dev) { return dev->driver_data; } static inline void dev_set_drvdata(struct device *dev, void *data) { dev->driver_data = data; } static inline struct pm_subsys_data *dev_to_psd(struct device *dev) { return dev ? dev->power.subsys_data : NULL; } static inline unsigned int dev_get_uevent_suppress(const struct device *dev) { return dev->kobj.uevent_suppress; } static inline void dev_set_uevent_suppress(struct device *dev, int val) { dev->kobj.uevent_suppress = val; } static inline int device_is_registered(struct device *dev) { return dev->kobj.state_in_sysfs; } static inline void device_enable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = true; } static inline void device_disable_async_suspend(struct device *dev) { if (!dev->power.is_prepared) dev->power.async_suspend = false; } static inline bool device_async_suspend_enabled(struct device *dev) { return !!dev->power.async_suspend; } static inline bool device_pm_not_required(struct device *dev) { return dev->power.no_pm; } static inline void device_set_pm_not_required(struct device *dev) { dev->power.no_pm = true; } static inline void dev_pm_syscore_device(struct device *dev, bool val) { #ifdef CONFIG_PM_SLEEP dev->power.syscore = val; #endif } static inline void dev_pm_set_driver_flags(struct device *dev, u32 flags) { dev->power.driver_flags = flags; } static inline bool dev_pm_test_driver_flags(struct device *dev, u32 flags) { return !!(dev->power.driver_flags & flags); } static inline void device_lock(struct device *dev) { mutex_lock(&dev->mutex); } static inline int device_lock_interruptible(struct device *dev) { return mutex_lock_interruptible(&dev->mutex); } static inline int device_trylock(struct device *dev) { return mutex_trylock(&dev->mutex); } static inline void device_unlock(struct device *dev) { mutex_unlock(&dev->mutex); } static inline void device_lock_assert(struct device *dev) { lockdep_assert_held(&dev->mutex); } static inline struct device_node *dev_of_node(struct device *dev) { if (!IS_ENABLED(CONFIG_OF) || !dev) return NULL; return dev->of_node; } static inline bool dev_has_sync_state(struct device *dev) { if (!dev) return false; if (dev->driver && dev->driver->sync_state) return true; if (dev->bus && dev->bus->sync_state) return true; return false; } /* * High level routines for use by the bus drivers */ int __must_check device_register(struct device *dev); void device_unregister(struct device *dev); void device_initialize(struct device *dev); int __must_check device_add(struct device *dev); void device_del(struct device *dev); int device_for_each_child(struct device *dev, void *data, int (*fn)(struct device *dev, void *data)); int device_for_each_child_reverse(struct device 
*dev, void *data, int (*fn)(struct device *dev, void *data)); struct device *device_find_child(struct device *dev, void *data, int (*match)(struct device *dev, void *data)); struct device *device_find_child_by_name(struct device *parent, const char *name); int device_rename(struct device *dev, const char *new_name); int device_move(struct device *dev, struct device *new_parent, enum dpm_order dpm_order); int device_change_owner(struct device *dev, kuid_t kuid, kgid_t kgid); const char *device_get_devnode(struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid, const char **tmp); int device_is_dependent(struct device *dev, void *target); static inline bool device_supports_offline(struct device *dev) { return dev->bus && dev->bus->offline && dev->bus->online; } void lock_device_hotplug(void); void unlock_device_hotplug(void); int lock_device_hotplug_sysfs(void); int device_offline(struct device *dev); int device_online(struct device *dev); void set_primary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void set_secondary_fwnode(struct device *dev, struct fwnode_handle *fwnode); void device_set_of_node_from_dev(struct device *dev, const struct device *dev2); static inline int dev_num_vf(struct device *dev) { if (dev->bus && dev->bus->num_vf) return dev->bus->num_vf(dev); return 0; } /* * Root device objects for grouping under /sys/devices */ struct device *__root_device_register(const char *name, struct module *owner); /* This is a macro to avoid include problems with THIS_MODULE */ #define root_device_register(name) \ __root_device_register(name, THIS_MODULE) void root_device_unregister(struct device *root); static inline void *dev_get_platdata(const struct device *dev) { return dev->platform_data; } /* * Manual binding of a device to driver. See drivers/base/bus.c * for information on use. 
*/ int __must_check device_bind_driver(struct device *dev); void device_release_driver(struct device *dev); int __must_check device_attach(struct device *dev); int __must_check driver_attach(struct device_driver *drv); void device_initial_probe(struct device *dev); int __must_check device_reprobe(struct device *dev); bool device_is_bound(struct device *dev); /* * Easy functions for dynamically creating devices on the fly */ __printf(5, 6) struct device * device_create(struct class *cls, struct device *parent, dev_t devt, void *drvdata, const char *fmt, ...); __printf(6, 7) struct device * device_create_with_groups(struct class *cls, struct device *parent, dev_t devt, void *drvdata, const struct attribute_group **groups, const char *fmt, ...); void device_destroy(struct class *cls, dev_t devt); int __must_check device_add_groups(struct device *dev, const struct attribute_group **groups); void device_remove_groups(struct device *dev, const struct attribute_group **groups); static inline int __must_check device_add_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; return device_add_groups(dev, groups); } static inline void device_remove_group(struct device *dev, const struct attribute_group *grp) { const struct attribute_group *groups[] = { grp, NULL }; return device_remove_groups(dev, groups); } int __must_check devm_device_add_groups(struct device *dev, const struct attribute_group **groups); void devm_device_remove_groups(struct device *dev, const struct attribute_group **groups); int __must_check devm_device_add_group(struct device *dev, const struct attribute_group *grp); void devm_device_remove_group(struct device *dev, const struct attribute_group *grp); /* * Platform "fixup" functions - allow the platform to have their say * about devices and actions that the general device layer doesn't * know about. */ /* Notify platform of device discovery */ extern int (*platform_notify)(struct device *dev); extern int (*platform_notify_remove)(struct device *dev); /* * get_device - atomically increment the reference count for the device. * */ struct device *get_device(struct device *dev); void put_device(struct device *dev); bool kill_device(struct device *dev); #ifdef CONFIG_DEVTMPFS int devtmpfs_mount(void); #else static inline int devtmpfs_mount(void) { return 0; } #endif /* drivers/base/power/shutdown.c */ void device_shutdown(void); /* debugging and troubleshooting/diagnostic helpers. */ const char *dev_driver_string(const struct device *dev); /* Device links interface. */ struct device_link *device_link_add(struct device *consumer, struct device *supplier, u32 flags); void device_link_del(struct device_link *link); void device_link_remove(void *consumer, struct device *supplier); void device_links_supplier_sync_state_pause(void); void device_links_supplier_sync_state_resume(void); extern __printf(3, 4) int dev_err_probe(const struct device *dev, int err, const char *fmt, ...); /* Create alias, so I can be autoloaded. */ #define MODULE_ALIAS_CHARDEV(major,minor) \ MODULE_ALIAS("char-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_CHARDEV_MAJOR(major) \ MODULE_ALIAS("char-major-" __stringify(major) "-*") #ifdef CONFIG_SYSFS_DEPRECATED extern long sysfs_deprecated; #else #define sysfs_deprecated 0 #endif #endif /* _DEVICE_H_ */
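/*
 * Editor's note: illustrative sketch only, not part of <linux/device.h>.
 * It shows the intended pairing of dev_set_drvdata()/dev_get_drvdata()
 * with device_lock()/device_unlock() for per-device driver state. The
 * structure foo_priv and the functions foo_probe_store_state() and
 * foo_use_state() are hypothetical names used only for demonstration.
 */
#include <linux/device.h>
#include <linux/errno.h>

struct foo_priv {
	int state;
};

static int foo_probe_store_state(struct device *dev, struct foo_priv *priv)
{
	/* Typically done in a probe routine: attach driver-private state. */
	dev_set_drvdata(dev, priv);
	return 0;
}

static int foo_use_state(struct device *dev)
{
	struct foo_priv *priv;
	int ret = -ENODEV;

	/* dev->mutex serializes against probe/remove in the driver core. */
	device_lock(dev);
	priv = dev_get_drvdata(dev);
	if (priv) {
		priv->state++;
		ret = 0;
	}
	device_unlock(dev);

	return ret;
}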
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_WORD_AT_A_TIME_H #define _ASM_WORD_AT_A_TIME_H #include <linux/kernel.h> /* * This is largely generic for little-endian machines, but the * optimal byte mask counting is probably going to be something * that is architecture-specific. If you have a reliably fast * bit count instruction, that might be better than the multiply * and shift, for example. */ struct word_at_a_time { const unsigned long one_bits, high_bits; }; #define WORD_AT_A_TIME_CONSTANTS { REPEAT_BYTE(0x01), REPEAT_BYTE(0x80) } #ifdef CONFIG_64BIT /* * Jan Achrenius on G+: microoptimized version of * the simpler "(mask & ONEBYTES) * ONEBYTES >> 56" * that works for the bytemasks without having to * mask them first. */ static inline long count_masked_bytes(unsigned long mask) { return mask*0x0001020304050608ul >> 56; } #else /* 32-bit case */ /* Carl Chatfield / Jan Achrenius G+ version for 32-bit */ static inline long count_masked_bytes(long mask) { /* (000000 0000ff 00ffff ffffff) -> ( 1 1 2 3 ) */ long a = (0x0ff0001+mask) >> 23; /* Fix the 1 for 00 case */ return a & mask; } #endif /* Return nonzero if it has a zero */ static inline unsigned long has_zero(unsigned long a, unsigned long *bits, const struct word_at_a_time *c) { unsigned long mask = ((a - c->one_bits) & ~a) & c->high_bits; *bits = mask; return mask; } static inline unsigned long prep_zero_mask(unsigned long a, unsigned long bits, const struct word_at_a_time *c) { return bits; } static inline unsigned long create_zero_mask(unsigned long bits) { bits = (bits - 1) & ~bits; return bits >> 7; } /* The mask we created is directly usable as a bytemask */ #define zero_bytemask(mask) (mask) static inline unsigned long find_zero(unsigned long mask) { return count_masked_bytes(mask); } /* * Load an unaligned word from kernel space. * * In the (very unlikely) case of the word being a page-crosser * and the next page not being mapped, take the exception and * return zeroes in the non-existing part. */ static inline unsigned long load_unaligned_zeropad(const void *addr) { unsigned long ret, dummy; asm( "1:\tmov %2,%0\n" "2:\n" ".section .fixup,\"ax\"\n" "3:\t" "lea %2,%1\n\t" "and %3,%1\n\t" "mov (%1),%0\n\t" "leal %2,%%ecx\n\t" "andl %4,%%ecx\n\t" "shll $3,%%ecx\n\t" "shr %%cl,%0\n\t" "jmp 2b\n" ".previous\n" _ASM_EXTABLE(1b, 3b) :"=&r" (ret),"=&c" (dummy) :"m" (*(unsigned long *)addr), "i" (-sizeof(unsigned long)), "i" (sizeof(unsigned long)-1)); return ret; } #endif /* _ASM_WORD_AT_A_TIME_H */
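/*
 * Editor's note: illustrative sketch, not part of the header above.
 * It shows the intended calling sequence of has_zero(), prep_zero_mask(),
 * create_zero_mask() and find_zero() to locate the terminating NUL of a
 * word-aligned string, mirroring how lib/string.c uses these helpers.
 * The name example_strlen() is hypothetical; the string is assumed to be
 * aligned to sizeof(unsigned long) and readable past the NUL up to the
 * next word boundary, as the real in-kernel callers arrange.
 */
#include <linux/kernel.h>
#include <asm/word-at-a-time.h>

static size_t example_strlen(const char *s)
{
	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
	const unsigned long *p = (const unsigned long *)s;
	size_t len = 0;

	for (;;) {
		unsigned long val = *p++;
		unsigned long bits;

		if (has_zero(val, &bits, &constants)) {
			bits = prep_zero_mask(val, bits, &constants);
			bits = create_zero_mask(bits);
			return len + find_zero(bits);
		}
		len += sizeof(unsigned long);
	}
}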
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMERQUEUE_H #define _LINUX_TIMERQUEUE_H #include <linux/rbtree.h> #include <linux/ktime.h> struct timerqueue_node { struct rb_node node; ktime_t expires; }; struct timerqueue_head { struct rb_root_cached rb_root; }; extern bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); extern bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); extern struct timerqueue_node *timerqueue_iterate_next( struct timerqueue_node *node); /** * timerqueue_getnext - Returns the timer with the earliest expiration time * * @head: head of timerqueue * * Returns a pointer to the timer node that has the earliest expiration time. */ static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { struct rb_node *leftmost = rb_first_cached(&head->rb_root); return rb_entry(leftmost, struct timerqueue_node, node); } static inline void timerqueue_init(struct timerqueue_node *node) { RB_CLEAR_NODE(&node->node); } static inline bool timerqueue_node_queued(struct timerqueue_node *node) { return !RB_EMPTY_NODE(&node->node); } static inline ktime_t timerqueue_node_expires(struct timerqueue_node *node) { return node->expires; } static inline void timerqueue_init_head(struct timerqueue_head *head) { head->rb_root = RB_ROOT_CACHED; } #endif /* _LINUX_TIMERQUEUE_H */
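/*
 * Editor's note: illustrative sketch, not part of the header above.
 * It shows the basic timerqueue lifecycle: initialize the head, set each
 * node's expiry before adding it, and peek at the earliest-expiring node.
 * The function example_earliest() and the on-stack nodes are hypothetical;
 * real users (e.g. hrtimers) embed timerqueue_node in a larger structure
 * and use container_of() on the pointer returned by timerqueue_getnext().
 */
#include <linux/timerqueue.h>
#include <linux/ktime.h>

static ktime_t example_earliest(void)
{
	struct timerqueue_head head;
	struct timerqueue_node a, b;
	struct timerqueue_node *next;

	timerqueue_init_head(&head);
	timerqueue_init(&a);
	timerqueue_init(&b);

	a.expires = ms_to_ktime(100);
	b.expires = ms_to_ktime(50);

	/* timerqueue_add() returns true when the new node becomes the earliest. */
	timerqueue_add(&head, &a);
	timerqueue_add(&head, &b);

	next = timerqueue_getnext(&head);	/* &b: it expires first */
	return timerqueue_node_expires(next);
}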
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ADDRCONF_H #define _ADDRCONF_H #define MAX_RTR_SOLICITATIONS -1 /* unlimited */ #define RTR_SOLICITATION_INTERVAL (4*HZ) #define RTR_SOLICITATION_MAX_INTERVAL (3600*HZ) /* 1 hour */ #define TEMP_VALID_LIFETIME (7*86400) #define TEMP_PREFERRED_LIFETIME (86400) #define REGEN_MAX_RETRY (3) #define MAX_DESYNC_FACTOR (600) #define ADDR_CHECK_FREQUENCY (120*HZ) #define IPV6_MAX_ADDRESSES 16 #define ADDRCONF_TIMER_FUZZ_MINUS (HZ > 50 ?
HZ / 50 : 1) #define ADDRCONF_TIMER_FUZZ (HZ / 4) #define ADDRCONF_TIMER_FUZZ_MAX (HZ) #define ADDRCONF_NOTIFY_PRIORITY 0 #include <linux/in.h> #include <linux/in6.h> struct prefix_info { __u8 type; __u8 length; __u8 prefix_len; #if defined(__BIG_ENDIAN_BITFIELD) __u8 onlink : 1, autoconf : 1, reserved : 6; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved : 6, autoconf : 1, onlink : 1; #else #error "Please fix <asm/byteorder.h>" #endif __be32 valid; __be32 prefered; __be32 reserved2; struct in6_addr prefix; }; #include <linux/ipv6.h> #include <linux/netdevice.h> #include <net/if_inet6.h> #include <net/ipv6.h> struct in6_validator_info { struct in6_addr i6vi_addr; struct inet6_dev *i6vi_dev; struct netlink_ext_ack *extack; }; struct ifa6_config { const struct in6_addr *pfx; unsigned int plen; const struct in6_addr *peer_pfx; u32 rt_priority; u32 ifa_flags; u32 preferred_lft; u32 valid_lft; u16 scope; }; int addrconf_init(void); void addrconf_cleanup(void); int addrconf_add_ifaddr(struct net *net, void __user *arg); int addrconf_del_ifaddr(struct net *net, void __user *arg); int addrconf_set_dstaddr(struct net *net, void __user *arg); int ipv6_chk_addr(struct net *net, const struct in6_addr *addr, const struct net_device *dev, int strict); int ipv6_chk_addr_and_flags(struct net *net, const struct in6_addr *addr, const struct net_device *dev, bool skip_dev_check, int strict, u32 banned_flags); #if defined(CONFIG_IPV6_MIP6) || defined(CONFIG_IPV6_MIP6_MODULE) int ipv6_chk_home_addr(struct net *net, const struct in6_addr *addr); #endif int ipv6_chk_rpl_srh_loop(struct net *net, const struct in6_addr *segs, unsigned char nsegs); bool ipv6_chk_custom_prefix(const struct in6_addr *addr, const unsigned int prefix_len, struct net_device *dev); int ipv6_chk_prefix(const struct in6_addr *addr, struct net_device *dev); struct net_device *ipv6_dev_find(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *addr, struct net_device *dev, int strict); int ipv6_dev_get_saddr(struct net *net, const struct net_device *dev, const struct in6_addr *daddr, unsigned int srcprefs, struct in6_addr *saddr); int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr, u32 banned_flags); int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr, u32 banned_flags); bool inet_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2, bool match_wildcard); bool inet_rcv_saddr_any(const struct sock *sk); void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr); void addrconf_leave_solict(struct inet6_dev *idev, const struct in6_addr *addr); void addrconf_add_linklocal(struct inet6_dev *idev, const struct in6_addr *addr, u32 flags); int addrconf_prefix_rcv_add_addr(struct net *net, struct net_device *dev, const struct prefix_info *pinfo, struct inet6_dev *in6_dev, const struct in6_addr *addr, int addr_type, u32 addr_flags, bool sllao, bool tokenized, __u32 valid_lft, u32 prefered_lft); static inline void addrconf_addr_eui48_base(u8 *eui, const char *const addr) { memcpy(eui, addr, 3); eui[3] = 0xFF; eui[4] = 0xFE; memcpy(eui + 5, addr + 3, 3); } static inline void addrconf_addr_eui48(u8 *eui, const char *const addr) { addrconf_addr_eui48_base(eui, addr); eui[0] ^= 2; } static inline int addrconf_ifid_eui48(u8 *eui, struct net_device *dev) { if (dev->addr_len != ETH_ALEN) return -1; /* * The zSeries OSA network cards can be shared among various * OS instances, but the OSA cards have only 
one MAC address. * This leads to duplicate address conflicts in conjunction * with IPv6 if more than one instance uses the same card. * * The driver for these cards can deliver a unique 16-bit * identifier for each instance sharing the same card. It is * placed instead of 0xFFFE in the interface identifier. The * "u" bit of the interface identifier is not inverted in this * case. Hence the resulting interface identifier has local * scope according to RFC2373. */ addrconf_addr_eui48_base(eui, dev->dev_addr); if (dev->dev_id) { eui[3] = (dev->dev_id >> 8) & 0xFF; eui[4] = dev->dev_id & 0xFF; } else { eui[0] ^= 2; } return 0; } static inline unsigned long addrconf_timeout_fixup(u32 timeout, unsigned int unit) { if (timeout == 0xffffffff) return ~0UL; /* * Avoid arithmetic overflow. * Assuming unit is constant and non-zero, this "if" statement * will go away on 64bit archs. */ if (0xfffffffe > LONG_MAX / unit && timeout > LONG_MAX / unit) return LONG_MAX / unit; return timeout; } static inline int addrconf_finite_timeout(unsigned long timeout) { return ~timeout; } /* * IPv6 Address Label subsystem (addrlabel.c) */ int ipv6_addr_label_init(void); void ipv6_addr_label_cleanup(void); int ipv6_addr_label_rtnl_register(void); u32 ipv6_addr_label(struct net *net, const struct in6_addr *addr, int type, int ifindex); /* * multicast prototypes (mcast.c) */ static inline bool ipv6_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ipv6_transport_len(skb) < len) return false; return pskb_may_pull(skb, len); } int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_mc_close(struct sock *sk); void ipv6_sock_mc_close(struct sock *sk); bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr, const struct in6_addr *src_addr); int ipv6_dev_mc_inc(struct net_device *dev, const struct in6_addr *addr); int __ipv6_dev_mc_dec(struct inet6_dev *idev, const struct in6_addr *addr); int ipv6_dev_mc_dec(struct net_device *dev, const struct in6_addr *addr); void ipv6_mc_up(struct inet6_dev *idev); void ipv6_mc_down(struct inet6_dev *idev); void ipv6_mc_unmap(struct inet6_dev *idev); void ipv6_mc_remap(struct inet6_dev *idev); void ipv6_mc_init_dev(struct inet6_dev *idev); void ipv6_mc_destroy_dev(struct inet6_dev *idev); int ipv6_mc_check_mld(struct sk_buff *skb); void addrconf_dad_failure(struct sk_buff *skb, struct inet6_ifaddr *ifp); bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, const struct in6_addr *src_addr); void ipv6_mc_dad_complete(struct inet6_dev *idev); /* * identify MLD packets for MLD filter exceptions */ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset) { struct icmp6hdr *hdr; if (nexthdr != IPPROTO_ICMPV6 || !pskb_network_may_pull(skb, offset + sizeof(struct icmp6hdr))) return false; hdr = (struct icmp6hdr *)(skb_network_header(skb) + offset); switch (hdr->icmp6_type) { case ICMPV6_MGM_QUERY: case ICMPV6_MGM_REPORT: case ICMPV6_MGM_REDUCTION: case ICMPV6_MLD2_REPORT: return true; default: break; } return false; } void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao); /* * anycast prototypes (anycast.c) */ int ipv6_sock_ac_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_ac_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); void __ipv6_sock_ac_close(struct sock *sk); void ipv6_sock_ac_close(struct sock *sk); int 
__ipv6_dev_ac_inc(struct inet6_dev *idev, const struct in6_addr *addr); int __ipv6_dev_ac_dec(struct inet6_dev *idev, const struct in6_addr *addr); void ipv6_ac_destroy_dev(struct inet6_dev *idev); bool ipv6_chk_acast_addr(struct net *net, struct net_device *dev, const struct in6_addr *addr); bool ipv6_chk_acast_addr_src(struct net *net, struct net_device *dev, const struct in6_addr *addr); int ipv6_anycast_init(void); void ipv6_anycast_cleanup(void); /* Device notifier */ int register_inet6addr_notifier(struct notifier_block *nb); int unregister_inet6addr_notifier(struct notifier_block *nb); int inet6addr_notifier_call_chain(unsigned long val, void *v); int register_inet6addr_validator_notifier(struct notifier_block *nb); int unregister_inet6addr_validator_notifier(struct notifier_block *nb); int inet6addr_validator_notifier_call_chain(unsigned long val, void *v); void inet6_netconf_notify_devconf(struct net *net, int event, int type, int ifindex, struct ipv6_devconf *devconf); /** * __in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_get(const struct net_device *dev) { return rcu_dereference_rtnl(dev->ip6_ptr); } /** * __in6_dev_stats_get - get inet6_dev pointer for stats * @dev: network device * @skb: skb for original incoming interface if needed * * Caller must hold rcu_read_lock or RTNL, because this function * does not take a reference on the inet6_dev. */ static inline struct inet6_dev *__in6_dev_stats_get(const struct net_device *dev, const struct sk_buff *skb) { if (netif_is_l3_master(dev)) dev = dev_get_by_index_rcu(dev_net(dev), inet6_iif(skb)); return __in6_dev_get(dev); } /** * __in6_dev_get_safely - get inet6_dev pointer from netdevice * @dev: network device * * This is a safer version of __in6_dev_get */ static inline struct inet6_dev *__in6_dev_get_safely(const struct net_device *dev) { if (likely(dev)) return rcu_dereference_rtnl(dev->ip6_ptr); else return NULL; } /** * in6_dev_get - get inet6_dev pointer from netdevice * @dev: network device * * This version can be used in any context, and takes a reference * on the inet6_dev. Callers must use in6_dev_put() later to * release this reference. */ static inline struct inet6_dev *in6_dev_get(const struct net_device *dev) { struct inet6_dev *idev; rcu_read_lock(); idev = rcu_dereference(dev->ip6_ptr); if (idev) refcount_inc(&idev->refcnt); rcu_read_unlock(); return idev; } static inline struct neigh_parms *__in6_dev_nd_parms_get_rcu(const struct net_device *dev) { struct inet6_dev *idev = __in6_dev_get(dev); return idev ?
idev->nd_parms : NULL; } void in6_dev_finish_destroy(struct inet6_dev *idev); static inline void in6_dev_put(struct inet6_dev *idev) { if (refcount_dec_and_test(&idev->refcnt)) in6_dev_finish_destroy(idev); } static inline void in6_dev_put_clear(struct inet6_dev **pidev) { struct inet6_dev *idev = *pidev; if (idev) { in6_dev_put(idev); *pidev = NULL; } } static inline void __in6_dev_put(struct inet6_dev *idev) { refcount_dec(&idev->refcnt); } static inline void in6_dev_hold(struct inet6_dev *idev) { refcount_inc(&idev->refcnt); } /* called with rcu_read_lock held */ static inline bool ip6_ignore_linkdown(const struct net_device *dev) { const struct inet6_dev *idev = __in6_dev_get(dev); return !!idev->cnf.ignore_routes_with_linkdown; } void inet6_ifa_finish_destroy(struct inet6_ifaddr *ifp); static inline void in6_ifa_put(struct inet6_ifaddr *ifp) { if (refcount_dec_and_test(&ifp->refcnt)) inet6_ifa_finish_destroy(ifp); } static inline void __in6_ifa_put(struct inet6_ifaddr *ifp) { refcount_dec(&ifp->refcnt); } static inline void in6_ifa_hold(struct inet6_ifaddr *ifp) { refcount_inc(&ifp->refcnt); } /* * compute link-local solicited-node multicast address */ static inline void addrconf_addr_solict_mult(const struct in6_addr *addr, struct in6_addr *solicited) { ipv6_addr_set(solicited, htonl(0xFF020000), 0, htonl(0x1), htonl(0xFF000000) | addr->s6_addr32[3]); } static inline bool ipv6_addr_is_ll_all_nodes(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(1))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000001))) == 0; #endif } static inline bool ipv6_addr_is_ll_all_routers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(2))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x00000002))) == 0; #endif } static inline bool ipv6_addr_is_isatap(const struct in6_addr *addr) { return (addr->s6_addr32[2] | htonl(0x02000000)) == htonl(0x02005EFE); } static inline bool ipv6_addr_is_solict_mult(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | ((p[1] ^ cpu_to_be64(0x00000001ff000000UL)) & cpu_to_be64(0xffffffffff000000UL))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x00000001)) | (addr->s6_addr[12] ^ 0xff)) == 0; #endif } static inline bool ipv6_addr_is_all_snoopers(const struct in6_addr *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 __be64 *p = (__force __be64 *)addr; return ((p[0] ^ cpu_to_be64(0xff02000000000000UL)) | (p[1] ^ cpu_to_be64(0x6a))) == 0UL; #else return ((addr->s6_addr32[0] ^ htonl(0xff020000)) | addr->s6_addr32[1] | addr->s6_addr32[2] | (addr->s6_addr32[3] ^ htonl(0x0000006a))) == 0; #endif } #ifdef CONFIG_PROC_FS int if6_proc_init(void); void if6_proc_exit(void); #endif #endif
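/*
 * Editor's note: illustrative sketch, not part of the header above.
 * addrconf_addr_eui48() expands a 6-byte Ethernet MAC into the 8-byte
 * modified EUI-64 interface identifier used for link-local addresses:
 * 0xFF 0xFE is inserted in the middle and the universal/local bit is
 * flipped. For 00:11:22:33:44:55 the result is 02:11:22:ff:fe:33:44:55.
 * The function name example_iid_from_mac() is hypothetical.
 */
#include <linux/if_ether.h>
#include <net/addrconf.h>

static void example_iid_from_mac(u8 iid[8])
{
	static const char mac[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };

	addrconf_addr_eui48(iid, mac);
	/* iid now holds 02:11:22:ff:fe:33:44:55 */
}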
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RMAP_H #define _LINUX_RMAP_H /* * Declarations for Reverse Mapping functions in mm/rmap.c */ #include <linux/list.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/rwsem.h> #include <linux/memcontrol.h> #include <linux/highmem.h> /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: * the vmas on the list will be related by forking, or by splitting. * * Since vmas come and go as they are split and merged (particularly * in mprotect), the mapping field of an anonymous page cannot point * directly to a vma: instead it points to an anon_vma, on whose list * the related vmas can be easily linked or unlinked. * * After unlinking the last vma on the list, we must garbage collect * the anon_vma object itself: we're guaranteed no page can be * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { struct anon_vma *root; /* Root of this anon_vma tree */ struct rw_semaphore rwsem; /* W: modification, R: walking the list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for * the duration of the operation. A caller that takes * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ atomic_t refcount; /* * Count of child anon_vmas and VMAs which point to this anon_vma. * * This counter is used for making a decision about reusing an anon_vma * instead of forking a new one. See comments in function anon_vma_clone. */ unsigned degree; struct anon_vma *parent; /* Parent of this anon_vma */ /* * NOTE: the LSB of the rb_root.rb_node is set by * mm_take_all_locks() _after_ taking the above lock. So the * rb_root must only be read/written after taking the above lock * to be sure to see a valid next pointer. The LSB bit itself * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). */ /* Interval tree of private "related" vmas */ struct rb_root_cached rb_root; }; /* * The copy-on-write semantics of fork mean that an anon_vma * can become associated with multiple processes. Furthermore, * each child process will have its own anon_vma, where new * pages for that process are instantiated. * * This structure allows us to find the anon_vmas associated * with a VMA, or the VMAs associated with an anon_vma.
* The "same_vma" list contains the anon_vma_chains linking * all the anon_vmas associated with this VMA. * The "rb" field indexes on an interval tree the anon_vma_chains * which link all the VMAs associated with this anon_vma. */ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_lock & page_table_lock */ struct rb_node rb; /* locked by anon_vma->rwsem */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; #endif }; enum ttu_flags { TTU_MIGRATION = 0x1, /* migration mode */ TTU_MUNLOCK = 0x2, /* munlock mode */ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ }; #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { atomic_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { if (atomic_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); } static inline void anon_vma_lock_read(struct anon_vma *anon_vma) { down_read(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); } /* * anon_vma helper functions. */ void anon_vma_init(void); /* create anon_vma_cachep */ int __anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); static inline int anon_vma_prepare(struct vm_area_struct *vma) { if (likely(vma->anon_vma)) return 0; return __anon_vma_prepare(vma); } static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); unlink_anon_vmas(next); } struct anon_vma *page_get_anon_vma(struct page *page); /* bitflags for do_page_add_anon_rmap() */ #define RMAP_EXCLUSIVE 0x01 #define RMAP_COMPOUND 0x02 /* * rmap interfaces called when adding or removing pte of page */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); void page_add_file_rmap(struct page *, bool); void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); static inline void page_dup_rmap(struct page *page, bool compound) { atomic_inc(compound ? 
compound_mapcount_ptr(page) : &page->_mapcount); } /* * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migration entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) struct page_vma_mapped_walk { struct page *page; struct vm_area_struct *vma; unsigned long address; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; unsigned int flags; }; static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ if (pvmw->pte && !PageHuge(pvmw->page)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); } bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); /* * Used by swapoff to help locate where page is expected in vma. */ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); /* * Cleans the PTEs of shared mappings. * (and since clean PTEs should also be readonly, write protects them too) * * returns the number of cleaned PTEs. */ int page_mkclean(struct page *); /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); /* * Called by memory-failure.c to kill processes. */ struct anon_vma *page_lock_anon_vma_read(struct page *page); void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking the traversal termination condition * anon_lock: for taking the anon_vma lock in an optimized way rather than the default * invalid_vma: for skipping uninteresting vmas */ struct rmap_walk_control { void *arg; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. */ bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct page *page, struct rmap_walk_control *rwc); void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) static inline int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) { *vm_flags = 0; return 0; } #define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { return 0; } #endif /* CONFIG_MMU */ #endif /* _LINUX_RMAP_H */
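/*
 * Editor's note: illustrative sketch, not part of the header above.
 * It shows how a caller wires up rmap_walk_control: rmap_one() is invoked
 * for every VMA in which the page is mapped, and returning false would
 * stop the walk early. The callback and example_count_mappings() are
 * hypothetical; the caller is assumed to hold the page lock so that the
 * default anon_vma/i_mmap locking in rmap_walk() applies.
 */
#include <linux/rmap.h>

static bool example_rmap_one(struct page *page, struct vm_area_struct *vma,
			     unsigned long addr, void *arg)
{
	unsigned long *count = arg;

	(*count)++;		/* page is mapped at 'addr' in 'vma' */
	return true;		/* keep walking the remaining VMAs */
}

static unsigned long example_count_mappings(struct page *page)
{
	unsigned long count = 0;
	struct rmap_walk_control rwc = {
		.arg = &count,
		.rmap_one = example_rmap_one,
	};

	rmap_walk(page, &rwc);
	return count;
}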
// SPDX-License-Identifier: GPL-2.0-or-later /* Common capabilities, needed by capability.o. */ #include <linux/capability.h> #include <linux/audit.h> #include <linux/init.h> #include <linux/kernel.h> #include <linux/lsm_hooks.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/skbuff.h> #include <linux/netlink.h> #include <linux/ptrace.h> #include <linux/xattr.h> #include <linux/hugetlb.h> #include <linux/mount.h> #include <linux/sched.h> #include <linux/prctl.h> #include <linux/securebits.h> #include <linux/user_namespace.h> #include <linux/binfmts.h> #include <linux/personality.h> /* * If a non-root user executes a setuid-root binary in * !secure(SECURE_NOROOT) mode, then we raise capabilities. * However if fE is also set, then the intent is for only * the file capabilities to be applied, and the setuid-root * bit is left on either to change the uid (plausible) or * to get full privilege on a kernel without file capabilities * support. So in that case we do not raise capabilities. * * Warn if that happens, once per boot.
*/ static void warn_setuid_and_fcaps_mixed(const char *fname) { static int warned; if (!warned) { printk(KERN_INFO "warning: `%s' has both setuid-root and" " effective capabilities. Therefore not raising all" " capabilities.\n", fname); warned = 1; } } /** * cap_capable - Determine whether a task has a particular effective capability * @cred: The credentials to use * @ns: The user namespace in which we need the capability * @cap: The capability to check for * @opts: Bitmask of options defined in include/linux/security.h * * Determine whether the nominated task has the specified capability amongst * its effective set, returning 0 if it does, -ve if it does not. * * NOTE WELL: cap_capable() cannot be used like the kernel's capable() * and has_capability() functions. That is, it has the reverse semantics: * cap_capable() returns 0 when a task has a capability, but the * kernel's capable() and has_capability() return 1 for this case. */ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns, int cap, unsigned int opts) { struct user_namespace *ns = targ_ns; /* See if cred has the capability in the target user namespace * by examining the target user namespace and all of the target * user namespace's parents. */ for (;;) { /* Do we have the necessary capabilities? */ if (ns == cred->user_ns) return cap_raised(cred->cap_effective, cap) ? 0 : -EPERM; /* * If we're already at a lower level than we're looking for, * we're done searching. */ if (ns->level <= cred->user_ns->level) return -EPERM; /* * The owner of the user namespace in the parent of the * user namespace has all caps. */ if ((ns->parent == cred->user_ns) && uid_eq(ns->owner, cred->euid)) return 0; /* * If you have a capability in a parent user ns, then you have * it over all children user namespaces as well. */ ns = ns->parent; } /* We never get here */ } /** * cap_settime - Determine whether the current process may set the system clock * @ts: The time to set * @tz: The timezone to set * * Determine whether the current process may set the system clock and timezone * information, returning 0 if permission granted, -ve if denied. */ int cap_settime(const struct timespec64 *ts, const struct timezone *tz) { if (!capable(CAP_SYS_TIME)) return -EPERM; return 0; } /** * cap_ptrace_access_check - Determine whether the current process may access * another * @child: The process to be accessed * @mode: The mode of attachment. * * If we are in the same or an ancestor user_ns and have all the target * task's capabilities, then ptrace access is allowed. * If we have the ptrace capability to the target user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether a process may access another, returning 0 if permission * granted, -ve if denied.
*/ int cap_ptrace_access_check(struct task_struct *child, unsigned int mode) { int ret = 0; const struct cred *cred, *child_cred; const kernel_cap_t *caller_caps; rcu_read_lock(); cred = current_cred(); child_cred = __task_cred(child); if (mode & PTRACE_MODE_FSCREDS) caller_caps = &cred->cap_effective; else caller_caps = &cred->cap_permitted; if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, *caller_caps)) goto out; if (ns_capable(child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_ptrace_traceme - Determine whether another process may trace the current * @parent: The task proposed to be the tracer * * If parent is in the same or an ancestor user_ns and has all current's * capabilities, then ptrace access is allowed. * If parent has the ptrace capability to current's user_ns, then ptrace * access is allowed. * Else denied. * * Determine whether the nominated task is permitted to trace the current * process, returning 0 if permission is granted, -ve if denied. */ int cap_ptrace_traceme(struct task_struct *parent) { int ret = 0; const struct cred *cred, *child_cred; rcu_read_lock(); cred = __task_cred(parent); child_cred = current_cred(); if (cred->user_ns == child_cred->user_ns && cap_issubset(child_cred->cap_permitted, cred->cap_permitted)) goto out; if (has_ns_capability(parent, child_cred->user_ns, CAP_SYS_PTRACE)) goto out; ret = -EPERM; out: rcu_read_unlock(); return ret; } /** * cap_capget - Retrieve a task's capability sets * @target: The task from which to retrieve the capability sets * @effective: The place to record the effective set * @inheritable: The place to record the inheritable set * @permitted: The place to record the permitted set * * This function retrieves the capabilities of the nominated task and returns * them to the caller. */ int cap_capget(struct task_struct *target, kernel_cap_t *effective, kernel_cap_t *inheritable, kernel_cap_t *permitted) { const struct cred *cred; /* Derived from kernel/capability.c:sys_capget. */ rcu_read_lock(); cred = __task_cred(target); *effective = cred->cap_effective; *inheritable = cred->cap_inheritable; *permitted = cred->cap_permitted; rcu_read_unlock(); return 0; } /* * Determine whether the inheritable capabilities are limited to the old * permitted set. Returns 1 if they are limited, 0 if they are not. */ static inline int cap_inh_is_capped(void) { /* they are so limited unless the current task has the CAP_SETPCAP * capability */ if (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) == 0) return 0; return 1; } /** * cap_capset - Validate and apply proposed changes to current's capabilities * @new: The proposed new credentials; alterations should be made here * @old: The current task's current credentials * @effective: A pointer to the proposed new effective capabilities set * @inheritable: A pointer to the proposed new inheritable capabilities set * @permitted: A pointer to the proposed new permitted capabilities set * * This function validates and applies a proposed mass change to the current * process's capability sets. The changes are made to the proposed new * credentials, and assuming no error, will be committed by the caller of LSM. 
*/ int cap_capset(struct cred *new, const struct cred *old, const kernel_cap_t *effective, const kernel_cap_t *inheritable, const kernel_cap_t *permitted) { if (cap_inh_is_capped() && !cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_permitted))) /* incapable of using this inheritable set */ return -EPERM; if (!cap_issubset(*inheritable, cap_combine(old->cap_inheritable, old->cap_bset))) /* no new pI capabilities outside bounding set */ return -EPERM; /* verify restrictions on target's new Permitted set */ if (!cap_issubset(*permitted, old->cap_permitted)) return -EPERM; /* verify the _new_Effective_ is a subset of the _new_Permitted_ */ if (!cap_issubset(*effective, *permitted)) return -EPERM; new->cap_effective = *effective; new->cap_inheritable = *inheritable; new->cap_permitted = *permitted; /* * Mask off ambient bits that are no longer both permitted and * inheritable. */ new->cap_ambient = cap_intersect(new->cap_ambient, cap_intersect(*permitted, *inheritable)); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EINVAL; return 0; } /** * cap_inode_need_killpriv - Determine if inode change affects privileges * @dentry: The inode/dentry in being changed with change marked ATTR_KILL_PRIV * * Determine if an inode having a change applied that's marked ATTR_KILL_PRIV * affects the security markings on that inode, and if it is, should * inode_killpriv() be invoked or the change rejected. * * Returns 1 if security.capability has a value, meaning inode_killpriv() * is required, 0 otherwise, meaning inode_killpriv() is not required. */ int cap_inode_need_killpriv(struct dentry *dentry) { struct inode *inode = d_backing_inode(dentry); int error; error = __vfs_getxattr(dentry, inode, XATTR_NAME_CAPS, NULL, 0); return error > 0; } /** * cap_inode_killpriv - Erase the security markings on an inode * @dentry: The inode/dentry to alter * * Erase the privilege-enhancing security markings on an inode. * * Returns 0 if successful, -ve on error. */ int cap_inode_killpriv(struct dentry *dentry) { int error; error = __vfs_removexattr(dentry, XATTR_NAME_CAPS); if (error == -EOPNOTSUPP) error = 0; return error; } static bool rootid_owns_currentns(kuid_t kroot) { struct user_namespace *ns; if (!uid_valid(kroot)) return false; for (ns = current_user_ns(); ; ns = ns->parent) { if (from_kuid(ns, kroot) == 0) return true; if (ns == &init_user_ns) break; } return false; } static __u32 sansflags(__u32 m) { return m & ~VFS_CAP_FLAGS_EFFECTIVE; } static bool is_v2header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_2) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_2; } static bool is_v3header(size_t size, const struct vfs_cap_data *cap) { if (size != XATTR_CAPS_SZ_3) return false; return sansflags(le32_to_cpu(cap->magic_etc)) == VFS_CAP_REVISION_3; } /* * getsecurity: We are called for security.* before any attempt to read the * xattr from the inode itself. * * This gives us a chance to read the on-disk value and convert it. If we * return -EOPNOTSUPP, then vfs_getxattr() will call the i_op handler. * * Note we are not called by vfs_getxattr_alloc(), but that is only called * by the integrity subsystem, which really wants the unconverted values - * so that's good. 
*/ int cap_inode_getsecurity(struct inode *inode, const char *name, void **buffer, bool alloc) { int size, ret; kuid_t kroot; u32 nsmagic, magic; uid_t root, mappedroot; char *tmpbuf = NULL; struct vfs_cap_data *cap; struct vfs_ns_cap_data *nscap = NULL; struct dentry *dentry; struct user_namespace *fs_ns; if (strcmp(name, "capability") != 0) return -EOPNOTSUPP; dentry = d_find_any_alias(inode); if (!dentry) return -EINVAL; size = sizeof(struct vfs_ns_cap_data); ret = (int) vfs_getxattr_alloc(dentry, XATTR_NAME_CAPS, &tmpbuf, size, GFP_NOFS); dput(dentry); if (ret < 0 || !tmpbuf) return ret; fs_ns = inode->i_sb->s_user_ns; cap = (struct vfs_cap_data *) tmpbuf; if (is_v2header((size_t) ret, cap)) { root = 0; } else if (is_v3header((size_t) ret, cap)) { nscap = (struct vfs_ns_cap_data *) tmpbuf; root = le32_to_cpu(nscap->rootid); } else { size = -EINVAL; goto out_free; } kroot = make_kuid(fs_ns, root); /* If the root kuid maps to a valid uid in current ns, then return * this as a nscap. */ mappedroot = from_kuid(current_user_ns(), kroot); if (mappedroot != (uid_t)-1 && mappedroot != (uid_t)0) { size = sizeof(struct vfs_ns_cap_data); if (alloc) { if (!nscap) { /* v2 -> v3 conversion */ nscap = kzalloc(size, GFP_ATOMIC); if (!nscap) { size = -ENOMEM; goto out_free; } nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); nscap->magic_etc = cpu_to_le32(nsmagic); } else { /* use allocated v3 buffer */ tmpbuf = NULL; } nscap->rootid = cpu_to_le32(mappedroot); *buffer = nscap; } goto out_free; } if (!rootid_owns_currentns(kroot)) { size = -EOVERFLOW; goto out_free; } /* This comes from a parent namespace. Return as a v2 capability */ size = sizeof(struct vfs_cap_data); if (alloc) { if (nscap) { /* v3 -> v2 conversion */ cap = kzalloc(size, GFP_ATOMIC); if (!cap) { size = -ENOMEM; goto out_free; } magic = VFS_CAP_REVISION_2; nsmagic = le32_to_cpu(nscap->magic_etc); if (nsmagic & VFS_CAP_FLAGS_EFFECTIVE) magic |= VFS_CAP_FLAGS_EFFECTIVE; memcpy(&cap->data, &nscap->data, sizeof(__le32) * 2 * VFS_CAP_U32); cap->magic_etc = cpu_to_le32(magic); } else { /* use unconverted v2 */ tmpbuf = NULL; } *buffer = cap; } out_free: kfree(tmpbuf); return size; } static kuid_t rootid_from_xattr(const void *value, size_t size, struct user_namespace *task_ns) { const struct vfs_ns_cap_data *nscap = value; uid_t rootid = 0; if (size == XATTR_CAPS_SZ_3) rootid = le32_to_cpu(nscap->rootid); return make_kuid(task_ns, rootid); } static bool validheader(size_t size, const struct vfs_cap_data *cap) { return is_v2header(size, cap) || is_v3header(size, cap); } /* * User requested a write of security.capability. If needed, update the * xattr to change from v2 to v3, or to fixup the v3 rootid. * * If all is ok, we return the new size, on error return < 0. 
*/ int cap_convert_nscap(struct dentry *dentry, void **ivalue, size_t size) { struct vfs_ns_cap_data *nscap; uid_t nsrootid; const struct vfs_cap_data *cap = *ivalue; __u32 magic, nsmagic; struct inode *inode = d_backing_inode(dentry); struct user_namespace *task_ns = current_user_ns(), *fs_ns = inode->i_sb->s_user_ns; kuid_t rootid; size_t newsize; if (!*ivalue) return -EINVAL; if (!validheader(size, cap)) return -EINVAL; if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; if (size == XATTR_CAPS_SZ_2) if (ns_capable(inode->i_sb->s_user_ns, CAP_SETFCAP)) /* user is privileged, just write the v2 */ return size; rootid = rootid_from_xattr(*ivalue, size, task_ns); if (!uid_valid(rootid)) return -EINVAL; nsrootid = from_kuid(fs_ns, rootid); if (nsrootid == -1) return -EINVAL; newsize = sizeof(struct vfs_ns_cap_data); nscap = kmalloc(newsize, GFP_ATOMIC); if (!nscap) return -ENOMEM; nscap->rootid = cpu_to_le32(nsrootid); nsmagic = VFS_CAP_REVISION_3; magic = le32_to_cpu(cap->magic_etc); if (magic & VFS_CAP_FLAGS_EFFECTIVE) nsmagic |= VFS_CAP_FLAGS_EFFECTIVE; nscap->magic_etc = cpu_to_le32(nsmagic); memcpy(&nscap->data, &cap->data, sizeof(__le32) * 2 * VFS_CAP_U32); kvfree(*ivalue); *ivalue = nscap; return newsize; } /* * Calculate the new process capability sets from the capability sets attached * to a file. */ static inline int bprm_caps_from_vfs_caps(struct cpu_vfs_cap_data *caps, struct linux_binprm *bprm, bool *effective, bool *has_fcap) { struct cred *new = bprm->cred; unsigned i; int ret = 0; if (caps->magic_etc & VFS_CAP_FLAGS_EFFECTIVE) *effective = true; if (caps->magic_etc & VFS_CAP_REVISION_MASK) *has_fcap = true; CAP_FOR_EACH_U32(i) { __u32 permitted = caps->permitted.cap[i]; __u32 inheritable = caps->inheritable.cap[i]; /* * pP' = (X & fP) | (pI & fI) * The addition of pA' is handled later. */ new->cap_permitted.cap[i] = (new->cap_bset.cap[i] & permitted) | (new->cap_inheritable.cap[i] & inheritable); if (permitted & ~new->cap_permitted.cap[i]) /* insufficient to execute correctly */ ret = -EPERM; } /* * For legacy apps, with no internal support for recognizing they * do not have enough capabilities, we return an error if they are * missing some "forced" (aka file-permitted) capabilities. */ return *effective ? ret : 0; } /* * Extract the on-exec-apply capability sets for an executable file. 
*/ int get_vfs_caps_from_disk(const struct dentry *dentry, struct cpu_vfs_cap_data *cpu_caps) { struct inode *inode = d_backing_inode(dentry); __u32 magic_etc; unsigned tocopy, i; int size; struct vfs_ns_cap_data data, *nscaps = &data; struct vfs_cap_data *caps = (struct vfs_cap_data *) &data; kuid_t rootkuid; struct user_namespace *fs_ns; memset(cpu_caps, 0, sizeof(struct cpu_vfs_cap_data)); if (!inode) return -ENODATA; fs_ns = inode->i_sb->s_user_ns; size = __vfs_getxattr((struct dentry *)dentry, inode, XATTR_NAME_CAPS, &data, XATTR_CAPS_SZ); if (size == -ENODATA || size == -EOPNOTSUPP) /* no data, that's ok */ return -ENODATA; if (size < 0) return size; if (size < sizeof(magic_etc)) return -EINVAL; cpu_caps->magic_etc = magic_etc = le32_to_cpu(caps->magic_etc); rootkuid = make_kuid(fs_ns, 0); switch (magic_etc & VFS_CAP_REVISION_MASK) { case VFS_CAP_REVISION_1: if (size != XATTR_CAPS_SZ_1) return -EINVAL; tocopy = VFS_CAP_U32_1; break; case VFS_CAP_REVISION_2: if (size != XATTR_CAPS_SZ_2) return -EINVAL; tocopy = VFS_CAP_U32_2; break; case VFS_CAP_REVISION_3: if (size != XATTR_CAPS_SZ_3) return -EINVAL; tocopy = VFS_CAP_U32_3; rootkuid = make_kuid(fs_ns, le32_to_cpu(nscaps->rootid)); break; default: return -EINVAL; } /* Limit the caps to the mounter of the filesystem * or the more limited uid specified in the xattr. */ if (!rootid_owns_currentns(rootkuid)) return -ENODATA; CAP_FOR_EACH_U32(i) { if (i >= tocopy) break; cpu_caps->permitted.cap[i] = le32_to_cpu(caps->data[i].permitted); cpu_caps->inheritable.cap[i] = le32_to_cpu(caps->data[i].inheritable); } cpu_caps->permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; cpu_caps->rootid = rootkuid; return 0; } /* * Attempt to get the on-exec apply capability sets for an executable file from * its xattrs and, if present, apply them to the proposed credentials being * constructed by execve(). */ static int get_file_caps(struct linux_binprm *bprm, struct file *file, bool *effective, bool *has_fcap) { int rc = 0; struct cpu_vfs_cap_data vcaps; cap_clear(bprm->cred->cap_permitted); if (!file_caps_enabled) return 0; if (!mnt_may_suid(file->f_path.mnt)) return 0; /* * This check is redundant with mnt_may_suid() but is kept to make * explicit that capability bits are limited to s_user_ns and its * descendants. */ if (!current_in_userns(file->f_path.mnt->mnt_sb->s_user_ns)) return 0; rc = get_vfs_caps_from_disk(file->f_path.dentry, &vcaps); if (rc < 0) { if (rc == -EINVAL) printk(KERN_NOTICE "Invalid argument reading file caps for %s\n", bprm->filename); else if (rc == -ENODATA) rc = 0; goto out; } rc = bprm_caps_from_vfs_caps(&vcaps, bprm, effective, has_fcap); out: if (rc) cap_clear(bprm->cred->cap_permitted); return rc; } static inline bool root_privileged(void) { return !issecure(SECURE_NOROOT); } static inline bool __is_real(kuid_t uid, struct cred *cred) { return uid_eq(cred->uid, uid); } static inline bool __is_eff(kuid_t uid, struct cred *cred) { return uid_eq(cred->euid, uid); } static inline bool __is_suid(kuid_t uid, struct cred *cred) { return !__is_real(uid, cred) && __is_eff(uid, cred); } /* * handle_privileged_root - Handle case of privileged root * @bprm: The execution parameters, including the proposed creds * @has_fcap: Are any file capabilities set? * @effective: Do we have effective root privilege? * @root_uid: This namespace' root UID WRT initial USER namespace * * Handle the case where root is privileged and hasn't been neutered by * SECURE_NOROOT. 
If file capabilities are set, they won't be combined with * set UID root and nothing is changed. If we are root, cap_permitted is * updated. If we have become set UID root, the effective bit is set. */ static void handle_privileged_root(struct linux_binprm *bprm, bool has_fcap, bool *effective, kuid_t root_uid) { const struct cred *old = current_cred(); struct cred *new = bprm->cred; if (!root_privileged()) return; /* * If the legacy file capability is set, then don't set privs * for a setuid root binary run by a non-root user. Do set it * for a root user just to cause least surprise to an admin. */ if (has_fcap && __is_suid(root_uid, new)) { warn_setuid_and_fcaps_mixed(bprm->filename); return; } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the * capability sets for the file. */ if (__is_eff(root_uid, new) || __is_real(root_uid, new)) { /* pP' = (cap_bset & ~0) | (pI & ~0) */ new->cap_permitted = cap_combine(old->cap_bset, old->cap_inheritable); } /* * If only the real uid is 0, we do not set the effective bit. */ if (__is_eff(root_uid, new)) *effective = true; } #define __cap_gained(field, target, source) \ !cap_issubset(target->cap_##field, source->cap_##field) #define __cap_grew(target, source, cred) \ !cap_issubset(cred->cap_##target, cred->cap_##source) #define __cap_full(field, cred) \ cap_issubset(CAP_FULL_SET, cred->cap_##field) static inline bool __is_setuid(struct cred *new, const struct cred *old) { return !uid_eq(new->euid, old->uid); } static inline bool __is_setgid(struct cred *new, const struct cred *old) { return !gid_eq(new->egid, old->gid); } /* * 1) Audit candidate if current->cap_effective is set * * We do not bother to audit if 3 things are true: * 1) cap_effective has all caps * 2) we became root *OR* are were already root * 3) root is supposed to have all caps (SECURE_NOROOT) * Since this is just a normal root execing a process. * * Number 1 above might fail if you don't have a full bset, but I think * that is interesting information to audit. * * A number of other conditions require logging: * 2) something prevented setuid root getting all caps * 3) non-setuid root gets fcaps * 4) non-setuid root gets ambient */ static inline bool nonroot_raised_pE(struct cred *new, const struct cred *old, kuid_t root, bool has_fcap) { bool ret = false; if ((__cap_grew(effective, ambient, new) && !(__cap_full(effective, new) && (__is_eff(root, new) || __is_real(root, new)) && root_privileged())) || (root_privileged() && __is_suid(root, new) && !__cap_full(effective, new)) || (!__is_setuid(new, old) && ((has_fcap && __cap_gained(permitted, new, old)) || __cap_gained(ambient, new, old)))) ret = true; return ret; } /** * cap_bprm_creds_from_file - Set up the proposed credentials for execve(). * @bprm: The execution parameters, including the proposed creds * @file: The file to pull the credentials from * * Set up the proposed credentials for a new execution context being * constructed by execve(). The proposed creds in @bprm->cred is altered, * which won't take effect immediately. Returns 0 if successful, -ve on error. 
*/ int cap_bprm_creds_from_file(struct linux_binprm *bprm, struct file *file) { /* Process setpcap binaries and capabilities for uid 0 */ const struct cred *old = current_cred(); struct cred *new = bprm->cred; bool effective = false, has_fcap = false, is_setid; int ret; kuid_t root_uid; if (WARN_ON(!cap_ambient_invariant_ok(old))) return -EPERM; ret = get_file_caps(bprm, file, &effective, &has_fcap); if (ret < 0) return ret; root_uid = make_kuid(new->user_ns, 0); handle_privileged_root(bprm, has_fcap, &effective, root_uid); /* if we have fs caps, clear dangerous personality flags */ if (__cap_gained(permitted, new, old)) bprm->per_clear |= PER_CLEAR_ON_SETID; /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit. * * In addition, if NO_NEW_PRIVS, then ensure we get no new privs. */ is_setid = __is_setuid(new, old) || __is_setgid(new, old); if ((is_setid || __cap_gained(permitted, new, old)) && ((bprm->unsafe & ~LSM_UNSAFE_PTRACE) || !ptracer_capable(current, new->user_ns))) { /* downgrade; they get no more than they had, and maybe less */ if (!ns_capable(new->user_ns, CAP_SETUID) || (bprm->unsafe & LSM_UNSAFE_NO_NEW_PRIVS)) { new->euid = new->uid; new->egid = new->gid; } new->cap_permitted = cap_intersect(new->cap_permitted, old->cap_permitted); } new->suid = new->fsuid = new->euid; new->sgid = new->fsgid = new->egid; /* File caps or setid cancels ambient. */ if (has_fcap || is_setid) cap_clear(new->cap_ambient); /* * Now that we've computed pA', update pP' to give: * pP' = (X & fP) | (pI & fI) | pA' */ new->cap_permitted = cap_combine(new->cap_permitted, new->cap_ambient); /* * Set pE' = (fE ? pP' : pA'). Because pA' is zero if fE is set, * this is the same as pE' = (fE ? pP' : 0) | pA'. */ if (effective) new->cap_effective = new->cap_permitted; else new->cap_effective = new->cap_ambient; if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; if (nonroot_raised_pE(new, old, root_uid, has_fcap)) { ret = audit_log_bprm_fcaps(bprm, new, old); if (ret < 0) return ret; } new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); if (WARN_ON(!cap_ambient_invariant_ok(new))) return -EPERM; /* Check for privilege-elevated exec. */ if (is_setid || (!__is_real(root_uid, new) && (effective || __cap_grew(permitted, ambient, new)))) bprm->secureexec = 1; return 0; } /** * cap_inode_setxattr - Determine whether an xattr may be altered * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * @value: The value that the xattr will be changed to * @size: The size of value * @flags: The replacement flag * * Determine whether an xattr may be altered or set on an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get updated or set by those * who aren't privileged to do so. 
*/ int cap_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; /* * For XATTR_NAME_CAPS the check will be done in * cap_convert_nscap(), called by setxattr() */ if (strcmp(name, XATTR_NAME_CAPS) == 0) return 0; if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /** * cap_inode_removexattr - Determine whether an xattr may be removed * @dentry: The inode/dentry being altered * @name: The name of the xattr to be changed * * Determine whether an xattr may be removed from an inode, returning 0 if * permission is granted, -ve if denied. * * This is used to make sure security xattrs don't get removed by those who * aren't privileged to remove them. */ int cap_inode_removexattr(struct dentry *dentry, const char *name) { struct user_namespace *user_ns = dentry->d_sb->s_user_ns; /* Ignore non-security xattrs */ if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) != 0) return 0; if (strcmp(name, XATTR_NAME_CAPS) == 0) { /* security.capability gets namespaced */ struct inode *inode = d_backing_inode(dentry); if (!inode) return -EINVAL; if (!capable_wrt_inode_uidgid(inode, CAP_SETFCAP)) return -EPERM; return 0; } if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } /* * cap_emulate_setxuid() fixes the effective / permitted capabilities of * a process after a call to setuid, setreuid, or setresuid. * * 1) When set*uiding _from_ one of {r,e,s}uid == 0 _to_ all of * {r,e,s}uid != 0, the permitted and effective capabilities are * cleared. * * 2) When set*uiding _from_ euid == 0 _to_ euid != 0, the effective * capabilities of the process are cleared. * * 3) When set*uiding _from_ euid != 0 _to_ euid == 0, the effective * capabilities are set to the permitted capabilities. * * fsuid is handled elsewhere. fsuid == 0 and {r,e,s}uid!= 0 should * never happen. * * -astor * * cevans - New behaviour, Oct '99 * A process may, via prctl(), elect to keep its capabilities when it * calls setuid() and switches away from uid==0. Both permitted and * effective sets will be retained. * Without this change, it was impossible for a daemon to drop only some * of its privilege. The call to setuid(!=0) would drop all privileges! * Keeping uid 0 is not an option because uid 0 owns too many vital * files.. * Thanks to Olaf Kirch and Peter Benie for spotting this. */ static inline void cap_emulate_setxuid(struct cred *new, const struct cred *old) { kuid_t root_uid = make_kuid(old->user_ns, 0); if ((uid_eq(old->uid, root_uid) || uid_eq(old->euid, root_uid) || uid_eq(old->suid, root_uid)) && (!uid_eq(new->uid, root_uid) && !uid_eq(new->euid, root_uid) && !uid_eq(new->suid, root_uid))) { if (!issecure(SECURE_KEEP_CAPS)) { cap_clear(new->cap_permitted); cap_clear(new->cap_effective); } /* * Pre-ambient programs expect setresuid to nonroot followed * by exec to drop capabilities. We should make sure that * this remains the case. 
*/ cap_clear(new->cap_ambient); } if (uid_eq(old->euid, root_uid) && !uid_eq(new->euid, root_uid)) cap_clear(new->cap_effective); if (!uid_eq(old->euid, root_uid) && uid_eq(new->euid, root_uid)) new->cap_effective = new->cap_permitted; } /** * cap_task_fix_setuid - Fix up the results of setuid() call * @new: The proposed credentials * @old: The current task's current credentials * @flags: Indications of what has changed * * Fix up the results of setuid() call before the credential changes are * actually applied, returning 0 to grant the changes, -ve to deny them. */ int cap_task_fix_setuid(struct cred *new, const struct cred *old, int flags) { switch (flags) { case LSM_SETID_RE: case LSM_SETID_ID: case LSM_SETID_RES: /* juggle the capabilities to follow [RES]UID changes unless * otherwise suppressed */ if (!issecure(SECURE_NO_SETUID_FIXUP)) cap_emulate_setxuid(new, old); break; case LSM_SETID_FS: /* juggle the capabilties to follow FSUID changes, unless * otherwise suppressed * * FIXME - is fsuser used for all CAP_FS_MASK capabilities? * if not, we might be a bit too harsh here. */ if (!issecure(SECURE_NO_SETUID_FIXUP)) { kuid_t root_uid = make_kuid(old->user_ns, 0); if (uid_eq(old->fsuid, root_uid) && !uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_drop_fs_set(new->cap_effective); if (!uid_eq(old->fsuid, root_uid) && uid_eq(new->fsuid, root_uid)) new->cap_effective = cap_raise_fs_set(new->cap_effective, new->cap_permitted); } break; default: return -EINVAL; } return 0; } /* * Rationale: code calling task_setscheduler, task_setioprio, and * task_setnice, assumes that * . if capable(cap_sys_nice), then those actions should be allowed * . if not capable(cap_sys_nice), but acting on your own processes, * then those actions should be allowed * This is insufficient now since you can call code without suid, but * yet with increased caps. * So we check for increased caps on the target process. */ static int cap_safe_nice(struct task_struct *p) { int is_subset, ret = 0; rcu_read_lock(); is_subset = cap_issubset(__task_cred(p)->cap_permitted, current_cred()->cap_permitted); if (!is_subset && !ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) ret = -EPERM; rcu_read_unlock(); return ret; } /** * cap_task_setscheduler - Detemine if scheduler policy change is permitted * @p: The task to affect * * Detemine if the requested scheduler policy change is permitted for the * specified task, returning 0 if permission is granted, -ve if denied. */ int cap_task_setscheduler(struct task_struct *p) { return cap_safe_nice(p); } /** * cap_task_ioprio - Detemine if I/O priority change is permitted * @p: The task to affect * @ioprio: The I/O priority to set * * Detemine if the requested I/O priority change is permitted for the specified * task, returning 0 if permission is granted, -ve if denied. */ int cap_task_setioprio(struct task_struct *p, int ioprio) { return cap_safe_nice(p); } /** * cap_task_ioprio - Detemine if task priority change is permitted * @p: The task to affect * @nice: The nice value to set * * Detemine if the requested task priority change is permitted for the * specified task, returning 0 if permission is granted, -ve if denied. */ int cap_task_setnice(struct task_struct *p, int nice) { return cap_safe_nice(p); } /* * Implement PR_CAPBSET_DROP. Attempt to remove the specified capability from * the current task's bounding set. Returns 0 on success, -ve on error. 
*/ static int cap_prctl_drop(unsigned long cap) { struct cred *new; if (!ns_capable(current_user_ns(), CAP_SETPCAP)) return -EPERM; if (!cap_valid(cap)) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_lower(new->cap_bset, cap); return commit_creds(new); } /** * cap_task_prctl - Implement process control functions for this security module * @option: The process control function requested * @arg2, @arg3, @arg4, @arg5: The argument data for this function * * Allow process control functions (sys_prctl()) to alter capabilities; may * also deny access to other functions not otherwise implemented here. * * Returns 0 or +ve on success, -ENOSYS if this function is not implemented * here, other -ve on error. If -ENOSYS is returned, sys_prctl() and other LSM * modules will consider performing the function. */ int cap_task_prctl(int option, unsigned long arg2, unsigned long arg3, unsigned long arg4, unsigned long arg5) { const struct cred *old = current_cred(); struct cred *new; switch (option) { case PR_CAPBSET_READ: if (!cap_valid(arg2)) return -EINVAL; return !!cap_raised(old->cap_bset, arg2); case PR_CAPBSET_DROP: return cap_prctl_drop(arg2); /* * The next four prctl's remain to assist with transitioning a * system from legacy UID=0 based privilege (when filesystem * capabilities are not in use) to a system using filesystem * capabilities only - as the POSIX.1e draft intended. * * Note: * * PR_SET_SECUREBITS = * issecure_mask(SECURE_KEEP_CAPS_LOCKED) * | issecure_mask(SECURE_NOROOT) * | issecure_mask(SECURE_NOROOT_LOCKED) * | issecure_mask(SECURE_NO_SETUID_FIXUP) * | issecure_mask(SECURE_NO_SETUID_FIXUP_LOCKED) * * will ensure that the current process and all of its * children will be locked into a pure * capability-based-privilege environment. 
*/ case PR_SET_SECUREBITS: if ((((old->securebits & SECURE_ALL_LOCKS) >> 1) & (old->securebits ^ arg2)) /*[1]*/ || ((old->securebits & SECURE_ALL_LOCKS & ~arg2)) /*[2]*/ || (arg2 & ~(SECURE_ALL_LOCKS | SECURE_ALL_BITS)) /*[3]*/ || (cap_capable(current_cred(), current_cred()->user_ns, CAP_SETPCAP, CAP_OPT_NONE) != 0) /*[4]*/ /* * [1] no changing of bits that are locked * [2] no unlocking of locks * [3] no setting of unsupported bits * [4] doing anything requires privilege (go read about * the "sendmail capabilities bug") */ ) /* cannot change a locked bit */ return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; new->securebits = arg2; return commit_creds(new); case PR_GET_SECUREBITS: return old->securebits; case PR_GET_KEEPCAPS: return !!issecure(SECURE_KEEP_CAPS); case PR_SET_KEEPCAPS: if (arg2 > 1) /* Note, we rely on arg2 being unsigned here */ return -EINVAL; if (issecure(SECURE_KEEP_CAPS_LOCKED)) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2) new->securebits |= issecure_mask(SECURE_KEEP_CAPS); else new->securebits &= ~issecure_mask(SECURE_KEEP_CAPS); return commit_creds(new); case PR_CAP_AMBIENT: if (arg2 == PR_CAP_AMBIENT_CLEAR_ALL) { if (arg3 | arg4 | arg5) return -EINVAL; new = prepare_creds(); if (!new) return -ENOMEM; cap_clear(new->cap_ambient); return commit_creds(new); } if (((!cap_valid(arg3)) | arg4 | arg5)) return -EINVAL; if (arg2 == PR_CAP_AMBIENT_IS_SET) { return !!cap_raised(current_cred()->cap_ambient, arg3); } else if (arg2 != PR_CAP_AMBIENT_RAISE && arg2 != PR_CAP_AMBIENT_LOWER) { return -EINVAL; } else { if (arg2 == PR_CAP_AMBIENT_RAISE && (!cap_raised(current_cred()->cap_permitted, arg3) || !cap_raised(current_cred()->cap_inheritable, arg3) || issecure(SECURE_NO_CAP_AMBIENT_RAISE))) return -EPERM; new = prepare_creds(); if (!new) return -ENOMEM; if (arg2 == PR_CAP_AMBIENT_RAISE) cap_raise(new->cap_ambient, arg3); else cap_lower(new->cap_ambient, arg3); return commit_creds(new); } default: /* No functionality available - continue with default */ return -ENOSYS; } } /** * cap_vm_enough_memory - Determine whether a new virtual mapping is permitted * @mm: The VM space in which the new mapping is to be made * @pages: The size of the mapping * * Determine whether the allocation of a new virtual mapping by the current * task is permitted, returning 1 if permission is granted, 0 if not. */ int cap_vm_enough_memory(struct mm_struct *mm, long pages) { int cap_sys_admin = 0; if (cap_capable(current_cred(), &init_user_ns, CAP_SYS_ADMIN, CAP_OPT_NOAUDIT) == 0) cap_sys_admin = 1; return cap_sys_admin; } /* * cap_mmap_addr - check if able to map given addr * @addr: address attempting to be mapped * * If the process is attempting to map memory below dac_mmap_min_addr they need * CAP_SYS_RAWIO. The other parameters to this function are unused by the * capability security module. Returns 0 if this mapping should be allowed * -EPERM if not. 
*/ int cap_mmap_addr(unsigned long addr) { int ret = 0; if (addr < dac_mmap_min_addr) { ret = cap_capable(current_cred(), &init_user_ns, CAP_SYS_RAWIO, CAP_OPT_NONE); /* set PF_SUPERPRIV if it turns out we allow the low mmap */ if (ret == 0) current->flags |= PF_SUPERPRIV; } return ret; } int cap_mmap_file(struct file *file, unsigned long reqprot, unsigned long prot, unsigned long flags) { return 0; } #ifdef CONFIG_SECURITY static struct security_hook_list capability_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(capable, cap_capable), LSM_HOOK_INIT(settime, cap_settime), LSM_HOOK_INIT(ptrace_access_check, cap_ptrace_access_check), LSM_HOOK_INIT(ptrace_traceme, cap_ptrace_traceme), LSM_HOOK_INIT(capget, cap_capget), LSM_HOOK_INIT(capset, cap_capset), LSM_HOOK_INIT(bprm_creds_from_file, cap_bprm_creds_from_file), LSM_HOOK_INIT(inode_need_killpriv, cap_inode_need_killpriv), LSM_HOOK_INIT(inode_killpriv, cap_inode_killpriv), LSM_HOOK_INIT(inode_getsecurity, cap_inode_getsecurity), LSM_HOOK_INIT(mmap_addr, cap_mmap_addr), LSM_HOOK_INIT(mmap_file, cap_mmap_file), LSM_HOOK_INIT(task_fix_setuid, cap_task_fix_setuid), LSM_HOOK_INIT(task_prctl, cap_task_prctl), LSM_HOOK_INIT(task_setscheduler, cap_task_setscheduler), LSM_HOOK_INIT(task_setioprio, cap_task_setioprio), LSM_HOOK_INIT(task_setnice, cap_task_setnice), LSM_HOOK_INIT(vm_enough_memory, cap_vm_enough_memory), }; static int __init capability_init(void) { security_add_hooks(capability_hooks, ARRAY_SIZE(capability_hooks), "capability"); return 0; } DEFINE_LSM(capability) = { .name = "capability", .order = LSM_ORDER_FIRST, .init = capability_init, }; #endif /* CONFIG_SECURITY */
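To make the prctl() plumbing above concrete, here is a minimal userspace sketch (not taken from the kernel tree; the constants are the uapi ones from <linux/prctl.h> and <linux/capability.h>, and the program itself is only illustrative) that exercises the PR_CAPBSET_* and PR_CAP_AMBIENT cases handled by cap_task_prctl():

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/capability.h>

int main(void)
{
	/* PR_CAPBSET_READ / PR_CAPBSET_DROP reach cap_prctl_drop() above;
	 * dropping a bounding-set capability requires CAP_SETPCAP. */
	int have = prctl(PR_CAPBSET_READ, CAP_SYS_BOOT, 0, 0, 0);

	printf("CAP_SYS_BOOT in bounding set: %d\n", have);
	if (prctl(PR_CAPBSET_DROP, CAP_SYS_BOOT, 0, 0, 0) < 0)
		perror("PR_CAPBSET_DROP");

	/* PR_CAP_AMBIENT_RAISE succeeds only if the capability is already in
	 * both pP and pI and SECURE_NO_CAP_AMBIENT_RAISE is clear, which are
	 * exactly the checks made in the PR_CAP_AMBIENT case above. */
	if (prctl(PR_CAP_AMBIENT, PR_CAP_AMBIENT_RAISE,
		  CAP_NET_BIND_SERVICE, 0, 0) < 0)
		perror("PR_CAP_AMBIENT_RAISE");

	return 0;
}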
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> (C) 2002 David Woodhouse <dwmw2@infradead.org> (C) 2012 Michel Lespinasse <walken@google.com> linux/include/linux/rbtree_augmented.h */ #ifndef _LINUX_RBTREE_AUGMENTED_H #define _LINUX_RBTREE_AUGMENTED_H #include <linux/compiler.h> #include <linux/rbtree.h> #include <linux/rcupdate.h> /* * Please note - only struct rb_augment_callbacks and the prototypes for * rb_insert_augmented() and rb_erase_augmented() are intended to be public. * The rest are implementation details you are not expected to depend on. * * See Documentation/core-api/rbtree.rst for documentation and samples. */ struct rb_augment_callbacks { void (*propagate)(struct rb_node *node, struct rb_node *stop); void (*copy)(struct rb_node *old, struct rb_node *new); void (*rotate)(struct rb_node *old, struct rb_node *new); }; extern void __rb_insert_augmented(struct rb_node *node, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); /* * Fixup the rbtree and update the augmented information when rebalancing. * * On insertion, the user must update the augmented information on the path * leading to the inserted node, then call rb_link_node() as usual and * rb_insert_augmented() instead of the usual rb_insert_color() call. * If rb_insert_augmented() rebalances the rbtree, it will callback into * a user provided function to update the augmented information on the * affected subtrees.
*/ static inline void rb_insert_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { __rb_insert_augmented(node, root, augment->rotate); } static inline void rb_insert_augmented_cached(struct rb_node *node, struct rb_root_cached *root, bool newleft, const struct rb_augment_callbacks *augment) { if (newleft) root->rb_leftmost = node; rb_insert_augmented(node, &root->rb_root, augment); } /* * Template for declaring augmented rbtree callbacks (generic case) * * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBAUGMENTED: name of field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that recomputes the RBAUGMENTED data */ #define RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBCOMPUTE) \ static inline void \ RBNAME ## _propagate(struct rb_node *rb, struct rb_node *stop) \ { \ while (rb != stop) { \ RBSTRUCT *node = rb_entry(rb, RBSTRUCT, RBFIELD); \ if (RBCOMPUTE(node, true)) \ break; \ rb = rb_parent(&node->RBFIELD); \ } \ } \ static inline void \ RBNAME ## _copy(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ } \ static void \ RBNAME ## _rotate(struct rb_node *rb_old, struct rb_node *rb_new) \ { \ RBSTRUCT *old = rb_entry(rb_old, RBSTRUCT, RBFIELD); \ RBSTRUCT *new = rb_entry(rb_new, RBSTRUCT, RBFIELD); \ new->RBAUGMENTED = old->RBAUGMENTED; \ RBCOMPUTE(old, false); \ } \ RBSTATIC const struct rb_augment_callbacks RBNAME = { \ .propagate = RBNAME ## _propagate, \ .copy = RBNAME ## _copy, \ .rotate = RBNAME ## _rotate \ }; /* * Template for declaring augmented rbtree callbacks, * computing RBAUGMENTED scalar as max(RBCOMPUTE(node)) for all subtree nodes. 
* * RBSTATIC: 'static' or empty * RBNAME: name of the rb_augment_callbacks structure * RBSTRUCT: struct type of the tree nodes * RBFIELD: name of struct rb_node field within RBSTRUCT * RBTYPE: type of the RBAUGMENTED field * RBAUGMENTED: name of RBTYPE field within RBSTRUCT holding data for subtree * RBCOMPUTE: name of function that returns the per-node RBTYPE scalar */ #define RB_DECLARE_CALLBACKS_MAX(RBSTATIC, RBNAME, RBSTRUCT, RBFIELD, \ RBTYPE, RBAUGMENTED, RBCOMPUTE) \ static inline bool RBNAME ## _compute_max(RBSTRUCT *node, bool exit) \ { \ RBSTRUCT *child; \ RBTYPE max = RBCOMPUTE(node); \ if (node->RBFIELD.rb_left) { \ child = rb_entry(node->RBFIELD.rb_left, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (node->RBFIELD.rb_right) { \ child = rb_entry(node->RBFIELD.rb_right, RBSTRUCT, RBFIELD); \ if (child->RBAUGMENTED > max) \ max = child->RBAUGMENTED; \ } \ if (exit && node->RBAUGMENTED == max) \ return true; \ node->RBAUGMENTED = max; \ return false; \ } \ RB_DECLARE_CALLBACKS(RBSTATIC, RBNAME, \ RBSTRUCT, RBFIELD, RBAUGMENTED, RBNAME ## _compute_max) #define RB_RED 0 #define RB_BLACK 1 #define __rb_parent(pc) ((struct rb_node *)(pc & ~3)) #define __rb_color(pc) ((pc) & 1) #define __rb_is_black(pc) __rb_color(pc) #define __rb_is_red(pc) (!__rb_color(pc)) #define rb_color(rb) __rb_color((rb)->__rb_parent_color) #define rb_is_red(rb) __rb_is_red((rb)->__rb_parent_color) #define rb_is_black(rb) __rb_is_black((rb)->__rb_parent_color) static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) { rb->__rb_parent_color = rb_color(rb) | (unsigned long)p; } static inline void rb_set_parent_color(struct rb_node *rb, struct rb_node *p, int color) { rb->__rb_parent_color = (unsigned long)p | color; } static inline void __rb_change_child(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) WRITE_ONCE(parent->rb_left, new); else WRITE_ONCE(parent->rb_right, new); } else WRITE_ONCE(root->rb_node, new); } static inline void __rb_change_child_rcu(struct rb_node *old, struct rb_node *new, struct rb_node *parent, struct rb_root *root) { if (parent) { if (parent->rb_left == old) rcu_assign_pointer(parent->rb_left, new); else rcu_assign_pointer(parent->rb_right, new); } else rcu_assign_pointer(root->rb_node, new); } extern void __rb_erase_color(struct rb_node *parent, struct rb_root *root, void (*augment_rotate)(struct rb_node *old, struct rb_node *new)); static __always_inline struct rb_node * __rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *child = node->rb_right; struct rb_node *tmp = node->rb_left; struct rb_node *parent, *rebalance; unsigned long pc; if (!tmp) { /* * Case 1: node to erase has no more than 1 child (easy!) * * Note that if there is one child it must be red due to 5) * and node must be black due to 4). We adjust colors locally * so as to bypass __rb_erase_color() later on. */ pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, child, parent, root); if (child) { child->__rb_parent_color = pc; rebalance = NULL; } else rebalance = __rb_is_black(pc) ? 
parent : NULL; tmp = parent; } else if (!child) { /* Still case 1, but this time the child is node->rb_left */ tmp->__rb_parent_color = pc = node->__rb_parent_color; parent = __rb_parent(pc); __rb_change_child(node, tmp, parent, root); rebalance = NULL; tmp = parent; } else { struct rb_node *successor = child, *child2; tmp = child->rb_left; if (!tmp) { /* * Case 2: node's successor is its right child * * (n) (s) * / \ / \ * (x) (s) -> (x) (c) * \ * (c) */ parent = successor; child2 = successor->rb_right; augment->copy(node, successor); } else { /* * Case 3: node's successor is leftmost under * node's right child subtree * * (n) (s) * / \ / \ * (x) (y) -> (x) (y) * / / * (p) (p) * / / * (s) (c) * \ * (c) */ do { parent = successor; successor = tmp; tmp = tmp->rb_left; } while (tmp); child2 = successor->rb_right; WRITE_ONCE(parent->rb_left, child2); WRITE_ONCE(successor->rb_right, child); rb_set_parent(child, successor); augment->copy(node, successor); augment->propagate(parent, successor); } tmp = node->rb_left; WRITE_ONCE(successor->rb_left, tmp); rb_set_parent(tmp, successor); pc = node->__rb_parent_color; tmp = __rb_parent(pc); __rb_change_child(node, successor, tmp, root); if (child2) { rb_set_parent_color(child2, parent, RB_BLACK); rebalance = NULL; } else { rebalance = rb_is_black(successor) ? parent : NULL; } successor->__rb_parent_color = pc; tmp = successor; } augment->propagate(tmp, NULL); return rebalance; } static __always_inline void rb_erase_augmented(struct rb_node *node, struct rb_root *root, const struct rb_augment_callbacks *augment) { struct rb_node *rebalance = __rb_erase_augmented(node, root, augment); if (rebalance) __rb_erase_color(rebalance, root, augment->rotate); } static __always_inline void rb_erase_augmented_cached(struct rb_node *node, struct rb_root_cached *root, const struct rb_augment_callbacks *augment) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase_augmented(node, &root->rb_root, augment); } #endif /* _LINUX_RBTREE_AUGMENTED_H */
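As a usage sketch for the RB_DECLARE_CALLBACKS_MAX() template above (the interval_node type and helper names are hypothetical, but the pattern follows the header comment: update the augmented data on the way down, then rb_link_node() plus rb_insert_augmented()):

#include <linux/rbtree_augmented.h>

struct interval_node {
	struct rb_node rb;		/* RBFIELD */
	unsigned long start, last;	/* per-node interval */
	unsigned long subtree_last;	/* RBAUGMENTED: max 'last' in subtree */
};

/* RBCOMPUTE: the per-node scalar that the template maximizes per subtree */
static inline unsigned long interval_node_last(struct interval_node *node)
{
	return node->last;
}

RB_DECLARE_CALLBACKS_MAX(static, interval_augment_cb,
			 struct interval_node, rb,
			 unsigned long, subtree_last, interval_node_last)

/* Insert sorted by 'start'; rb_insert_augmented() keeps subtree_last valid
 * across any rebalancing it performs. */
static void interval_insert(struct interval_node *new, struct rb_root *root)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;

	while (*link) {
		struct interval_node *cur;

		parent = *link;
		cur = rb_entry(parent, struct interval_node, rb);
		/* update the cached maximum on the path to the new node */
		if (cur->subtree_last < new->last)
			cur->subtree_last = new->last;
		if (new->start < cur->start)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	new->subtree_last = new->last;
	rb_link_node(&new->rb, parent, link);
	rb_insert_augmented(&new->rb, root, &interval_augment_cb);
}

Removal would go through rb_erase_augmented() with the same callback structure, which propagates the updated maxima back up the tree.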
/* SPDX-License-Identifier: GPL-2.0 */ /* * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.rst for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H #include <linux/lockdep_types.h> #include <linux/smp.h> #include <asm/percpu.h> struct task_struct; /* for sysctl */ extern int prove_locking; extern int lock_stat; #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> #include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { int i; *to = *from; /* * Since the class cache can be modified concurrently we could observe * half pointers (64bit arch using 32bit copy insns). Therefore clear * the caches and take the performance hit. * * XXX it doesn't work well with lockdep_set_class_and_subclass(), since * that relies on cache abuse.
*/ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) to->class_cache[i] = NULL; } /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: */ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; const struct lock_trace *trace; u16 distance; /* bitmap of different dependencies from head to this */ u8 dep; /* used by BFS to record whether "prev -> this" only has -(*R)-> */ u8 only_xr; /* * The parent field is used to implement breadth-first search, and the * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; /** * struct lock_chain - lock dependency chain record * * @irq_context: the same as irq_context in held_lock below * @depth: the number of held locks in this chain * @base: the index in chain_hlocks for this chain * @entry: the collided lock chains in lock_chain hash list * @chain_key: the hash key of this lock_chain */ struct lock_chain { /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; #define MAX_LOCKDEP_KEYS_BITS 13 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) #define INITIAL_CHAIN_KEY -1 struct held_lock { /* * One-way hash of the dependency chain up to this point. We * hash the hashes step by step as the dependency chain grows. * * We use it for dependency-caching and we skip detection * passes and dependency-updates if there is a cache-hit, so * it is absolutely critical for 100% coverage of the validator * to have a unique key value for every unique dependency path * that can occur in the system, to make a unique hash value * as likely as possible - hence the 64-bit width. * * The task struct holds the current hash value (initialized * with zero), here we store the previous hash value: */ u64 prev_chain_key; unsigned long acquire_ip; struct lockdep_map *instance; struct lockdep_map *nest_lock; #ifdef CONFIG_LOCK_STAT u64 waittime_stamp; u64 holdtime_stamp; #endif /* * class_idx is zero-indexed; it points to the element in * lock_classes this held lock instance belongs to. class_idx is in * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. */ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; /* * The lock-stack is unified in that the lock chains of interrupt * contexts nest ontop of process context chains, but we 'separate' * the hashes by starting with 0 if we cross into an interrupt * context, and we also keep do not add cross-context lock * dependencies - the lock usage graph walking covers that area * anyway, and we'd just unnecessarily increase the number of * dependencies otherwise. [Note: hardirq and softirq contexts * are separated from each other too.] 
* * The following field is used to detect when we cross into an * interrupt context: */ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ unsigned int trylock:1; /* 16 bits */ unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; }; /* * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); /* * Split the recrursion counter in two to readily detect 'off' vs recursion. */ #define LOCKDEP_RECURSION_BITS 16 #define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) #define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) /* * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due * to header dependencies. */ #define lockdep_off() \ do { \ current->lockdep_recursion += LOCKDEP_OFF; \ } while (0) #define lockdep_on() \ do { \ current->lockdep_recursion -= LOCKDEP_OFF; \ } while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); /* * These methods are used by specific locking variants (spinlocks, * rwlocks, mutexes and rwsems) to pass init/acquire/release events * to lockdep: */ extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type); static inline void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer) { lockdep_init_map_type(lock, name, key, subclass, inner, LD_WAIT_INV, LD_LOCK_NORMAL); } static inline void lockdep_init_map_wait(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner) { lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV); } static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV); } /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) * or they are too narrow (they suffer from a false class-split): */ #define lockdep_set_class(lock, key) \ lockdep_init_map_waits(&(lock)->dep_map, #key, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_class_and_name(lock, key, name) \ lockdep_init_map_waits(&(lock)->dep_map, name, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_class_and_subclass(lock, key, sub) \ lockdep_init_map_waits(&(lock)->dep_map, #key, key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_subclass(lock, sub) \ lockdep_init_map_waits(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /* * Compare locking classes */ #define 
lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key) static inline int lockdep_match_key(struct lockdep_map *lock, struct lock_class_key *key) { return lock->key == key; } /* * Acquire a lock. * * Values for "read": * * 0: exclusive (write) acquire * 1: read-acquire (no recursion allowed) * 2: read-acquire with same-instance recursion allowed * * Values for check: * * 0: simple checks (freeing, held-at-exit-time, etc.) * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); extern void lock_release(struct lockdep_map *lock, unsigned long ip); /* * Same "read" as for lock_acquire(), except -1 means any. */ extern int lock_is_held_type(const struct lockdep_map *lock, int read); static inline int lock_is_held(const struct lockdep_map *lock) { return lock_is_held_type(lock, -1); } #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) #define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { lock_set_class(lock, lock->name, lock->key, subclass, ip); } extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) #define lockdep_assert_held(l) do { \ WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) #define lockdep_assert_held_write(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ } while (0) #define lockdep_assert_held_read(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \ } while (0) #define lockdep_assert_held_once(l) do { \ WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ } while (0) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) #define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) { } static inline void lockdep_off(void) { } static inline void lockdep_on(void) { } static inline void lockdep_set_selftest_task(struct task_struct *task) { } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_wait(lock, name, key, sub, inner) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define 
lockdep_set_class_and_name(lock, key, name) \ do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather * #ifdef the call himself. */ # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) static inline void lockdep_register_key(struct lock_class_key *key) { } static inline void lockdep_unregister_key(struct lock_class_key *key) { } #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) #define lockdep_assert_held(l) do { (void)(l); } while (0) #define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #endif /* !LOCKDEP */ enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, XHLOCK_CTX_NR, }; #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. */ #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ ({ \ int ____err = 0; \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ ____err = lock(_lock); \ } \ if (!____err) \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ ____err; \ }) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) #define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ lock(_lock) #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_LOCKDEP /* * On lockdep we dont want the hand-coded irq-enable of * _raw_*_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ #define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ LOCK_CONTENDED((_lock), (try), (lock)) #else /* CONFIG_LOCKDEP */ #define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ lockfl((_lock), (flags)) #endif /* CONFIG_LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) { } #endif /* Variable used to make lockdep treat read_lock() as recursive in selftests */ #ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS extern unsigned int force_read_lock_recursive; 
#else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #define force_read_lock_recursive 0 #endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #ifdef CONFIG_LOCKDEP extern bool read_lock_is_recursive(void); #else /* CONFIG_LOCKDEP */ /* If !LOCKDEP, the value is meaningless */ #define read_lock_is_recursive() 0 #endif /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels * of nesting should define their own lock-nesting subclasses.) */ #define SINGLE_DEPTH_NESTING 1 /* * Map the dependency ops to NOP or to real lockdep ops, depending * on the per lock-class debug mode: */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) \ do { \ if (read_lock_is_recursive()) \ lock_acquire_shared_recursive(l, s, t, NULL, i); \ else \ lock_acquire_shared(l, s, t, NULL, i); \ } while (0) #define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define seqcount_release(l, i) lock_release(l, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define mutex_release(l, i) lock_release(l, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_nested(lock, subclass) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL, \ _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); DECLARE_PER_CPU(unsigned int, lockdep_recursion); #define __lockdep_enabled (debug_locks && !this_cpu_read(lockdep_recursion)) #define lockdep_assert_irqs_enabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \ } 
while (0) #define lockdep_assert_in_irq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() != 0 || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_disabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() == 0 && \ this_cpu_read(hardirqs_enabled))); \ } while (0) #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) #else # define lockdep_assert_RT_in_threaded_ctx() do { } while (0) #endif #ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); #else static inline void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { } #endif #endif /* __LINUX_LOCKDEP_H */
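A short usage sketch of the annotations above (the foo_dev names are illustrative, not from the kernel tree): dynamically allocated locks share one class via a static lock_class_key, an internal helper asserts the lock with lockdep_assert_held(), and might_lock() documents a path that may take it:

#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/lockdep.h>

struct foo_dev {
	spinlock_t lock;
	int value;
};

/* one lockdep class for every foo_dev instance, wherever it was allocated */
static struct lock_class_key foo_dev_lock_key;

static struct foo_dev *foo_dev_alloc(void)
{
	struct foo_dev *d = kzalloc(sizeof(*d), GFP_KERNEL);

	if (!d)
		return NULL;
	spin_lock_init(&d->lock);
	lockdep_set_class(&d->lock, &foo_dev_lock_key);
	return d;
}

/* internal helper: caller must hold d->lock */
static void __foo_dev_update(struct foo_dev *d, int value)
{
	lockdep_assert_held(&d->lock);
	d->value = value;
}

static void foo_dev_update(struct foo_dev *d, int value)
{
	might_lock(&d->lock);	/* documents that this path may take the lock */

	spin_lock(&d->lock);
	__foo_dev_update(d, value);
	spin_unlock(&d->lock);
}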
#ifndef _LINUX_MMAP_LOCK_H #define _LINUX_MMAP_LOCK_H #include <linux/mmdebug.h> #define MMAP_LOCK_INITIALIZER(name) \ .mmap_lock = __RWSEM_INITIALIZER((name).mmap_lock), static inline void mmap_init_lock(struct mm_struct *mm) { init_rwsem(&mm->mmap_lock); } static inline void mmap_write_lock(struct mm_struct *mm) { down_write(&mm->mmap_lock); } static inline void mmap_write_lock_nested(struct mm_struct *mm, int subclass) { down_write_nested(&mm->mmap_lock, subclass); } static inline int mmap_write_lock_killable(struct mm_struct *mm) { return down_write_killable(&mm->mmap_lock); } static inline bool mmap_write_trylock(struct mm_struct *mm) { return down_write_trylock(&mm->mmap_lock) != 0; } static inline void mmap_write_unlock(struct mm_struct *mm) { up_write(&mm->mmap_lock); } static inline void mmap_write_downgrade(struct mm_struct *mm) { downgrade_write(&mm->mmap_lock); } static inline void mmap_read_lock(struct mm_struct *mm) { down_read(&mm->mmap_lock); } static inline int mmap_read_lock_killable(struct mm_struct *mm) { return down_read_killable(&mm->mmap_lock); } static inline bool mmap_read_trylock(struct mm_struct *mm) { return down_read_trylock(&mm->mmap_lock) != 0; } static inline void mmap_read_unlock(struct mm_struct *mm) { up_read(&mm->mmap_lock); } static inline bool mmap_read_trylock_non_owner(struct mm_struct *mm) { if (down_read_trylock(&mm->mmap_lock)) { rwsem_release(&mm->mmap_lock.dep_map, _RET_IP_); return true; } return false; } static inline void mmap_read_unlock_non_owner(struct mm_struct *mm) { up_read_non_owner(&mm->mmap_lock); } static inline void mmap_assert_locked(struct mm_struct *mm) { lockdep_assert_held(&mm->mmap_lock); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); } static inline void mmap_assert_write_locked(struct mm_struct *mm) { lockdep_assert_held_write(&mm->mmap_lock); VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_lock), mm); } static inline int mmap_lock_is_contended(struct mm_struct *mm) { return rwsem_is_contended(&mm->mmap_lock); } #endif /* _LINUX_MMAP_LOCK_H */
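A brief sketch of the intended calling convention for these wrappers (the addr_is_mapped helpers are hypothetical): take mm->mmap_lock for read around VMA lookups and let internal helpers assert the lock rather than re-take it:

#include <linux/mm.h>
#include <linux/mmap_lock.h>

/* caller must hold mmap_lock in any mode */
static bool addr_is_mapped_locked(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma;

	mmap_assert_locked(mm);
	vma = find_vma(mm, addr);	/* first VMA with vm_end > addr */
	return vma && addr >= vma->vm_start;
}

static int addr_is_mapped(struct mm_struct *mm, unsigned long addr)
{
	int mapped;

	if (mmap_read_lock_killable(mm))
		return -EINTR;
	mapped = addr_is_mapped_locked(mm, addr);
	mmap_read_unlock(mm);

	return mapped;
}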
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * inet6 interface/address list definitions * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IF_INET6_H #define _NET_IF_INET6_H #include <net/snmp.h> #include <linux/ipv6.h> #include <linux/refcount.h> /* inet6_dev.if_flags */ #define IF_RA_OTHERCONF 0x80 #define IF_RA_MANAGED 0x40 #define IF_RA_RCVD 0x20 #define IF_RS_SENT 0x10 #define IF_READY 0x80000000 /* prefix flags */ #define IF_PREFIX_ONLINK 0x01 #define IF_PREFIX_AUTOCONF 0x02 enum { INET6_IFADDR_STATE_PREDAD, INET6_IFADDR_STATE_DAD, INET6_IFADDR_STATE_POSTDAD, INET6_IFADDR_STATE_ERRDAD, INET6_IFADDR_STATE_DEAD, }; struct inet6_ifaddr { struct in6_addr addr; __u32 prefix_len; __u32 rt_priority; /* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */ __u32 valid_lft; __u32 prefered_lft; refcount_t refcnt; spinlock_t lock; int state; __u32 flags; __u8 dad_probes; __u8 stable_privacy_retry; __u16 scope; __u64 dad_nonce; unsigned long cstamp; /* created timestamp */ unsigned long tstamp; /* updated timestamp */ struct delayed_work dad_work; struct inet6_dev *idev; struct fib6_info *rt; struct hlist_node addr_lst; struct list_head if_list; struct list_head tmp_list; struct inet6_ifaddr *ifpub; int regen_count; bool tokenized; struct rcu_head rcu; struct in6_addr peer_addr; }; struct ip6_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct in6_addr sl_addr[]; }; #define IP6_SFLSIZE(count) (sizeof(struct ip6_sf_socklist) + \ (count) * sizeof(struct in6_addr)) #define IP6_SFBLOCK 10 /* allocate this many at once */ struct ipv6_mc_socklist { struct in6_addr addr; int ifindex; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ipv6_mc_socklist __rcu *next; rwlock_t sflock; struct ip6_sf_socklist *sflist; struct rcu_head rcu; }; struct ip6_sf_list { struct ip6_sf_list *sf_next; struct in6_addr sf_addr; unsigned long sf_count[2]; /* include/exclude counts */ unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans.
left to send */ }; #define MAF_TIMER_RUNNING 0x01 #define MAF_LAST_REPORTER 0x02 #define MAF_LOADED 0x04 #define MAF_NOREPORT 0x08 #define MAF_GSQUERY 0x10 struct ifmcaddr6 { struct in6_addr mca_addr; struct inet6_dev *idev; struct ifmcaddr6 *next; struct ip6_sf_list *mca_sources; struct ip6_sf_list *mca_tomb; unsigned int mca_sfmode; unsigned char mca_crcount; unsigned long mca_sfcount[2]; struct timer_list mca_timer; unsigned int mca_flags; int mca_users; refcount_t mca_refcnt; spinlock_t mca_lock; unsigned long mca_cstamp; unsigned long mca_tstamp; }; /* Anycast stuff */ struct ipv6_ac_socklist { struct in6_addr acl_addr; int acl_ifindex; struct ipv6_ac_socklist *acl_next; }; struct ifacaddr6 { struct in6_addr aca_addr; struct fib6_info *aca_rt; struct ifacaddr6 *aca_next; struct hlist_node aca_addr_lst; int aca_users; refcount_t aca_refcnt; unsigned long aca_cstamp; unsigned long aca_tstamp; struct rcu_head rcu; }; #define IFA_HOST IPV6_ADDR_LOOPBACK #define IFA_LINK IPV6_ADDR_LINKLOCAL #define IFA_SITE IPV6_ADDR_SITELOCAL struct ipv6_devstat { struct proc_dir_entry *proc_dir_entry; DEFINE_SNMP_STAT(struct ipstats_mib, ipv6); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6_mib_device, icmpv6dev); DEFINE_SNMP_STAT_ATOMIC(struct icmpv6msg_mib_device, icmpv6msgdev); }; struct inet6_dev { struct net_device *dev; struct list_head addr_list; struct ifmcaddr6 *mc_list; struct ifmcaddr6 *mc_tomb; spinlock_t mc_lock; unsigned char mc_qrv; /* Query Robustness Variable */ unsigned char mc_gq_running; unsigned char mc_ifc_count; unsigned char mc_dad_count; unsigned long mc_v1_seen; /* Max time we stay in MLDv1 mode */ unsigned long mc_qi; /* Query Interval */ unsigned long mc_qri; /* Query Response Interval */ unsigned long mc_maxdelay; struct timer_list mc_gq_timer; /* general query timer */ struct timer_list mc_ifc_timer; /* interface change timer */ struct timer_list mc_dad_timer; /* dad complete mc timer */ struct ifacaddr6 *ac_list; rwlock_t lock; refcount_t refcnt; __u32 if_flags; int dead; u32 desync_factor; struct list_head tempaddr_list; struct in6_addr token; struct neigh_parms *nd_parms; struct ipv6_devconf cnf; struct ipv6_devstat stats; struct timer_list rs_timer; __s32 rs_interval; /* in jiffies */ __u8 rs_probes; unsigned long tstamp; /* ipv6InterfaceTable update timestamp */ struct rcu_head rcu; }; static inline void ipv6_eth_mc_map(const struct in6_addr *addr, char *buf) { /* * +-------+-------+-------+-------+-------+-------+ * | 33 | 33 | DST13 | DST14 | DST15 | DST16 | * +-------+-------+-------+-------+-------+-------+ */ buf[0]= 0x33; buf[1]= 0x33; memcpy(buf + 2, &addr->s6_addr32[3], sizeof(__u32)); } static inline void ipv6_arcnet_mc_map(const struct in6_addr *addr, char *buf) { buf[0] = 0x00; } static inline void ipv6_ib_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { unsigned char scope = broadcast[5] & 0xF; buf[0] = 0; /* Reserved */ buf[1] = 0xff; /* Multicast QPN */ buf[2] = 0xff; buf[3] = 0xff; buf[4] = 0xff; buf[5] = 0x10 | scope; /* scope from broadcast address */ buf[6] = 0x60; /* IPv6 signature */ buf[7] = 0x1b; buf[8] = broadcast[8]; /* P_Key */ buf[9] = broadcast[9]; memcpy(buf + 10, addr->s6_addr + 6, 10); } static inline int ipv6_ipgre_mc_map(const struct in6_addr *addr, const unsigned char *broadcast, char *buf) { if ((broadcast[0] | broadcast[1] | broadcast[2] | broadcast[3]) != 0) { memcpy(buf, broadcast, 4); } else { /* v4mapped? 
*/ if ((addr->s6_addr32[0] | addr->s6_addr32[1] | (addr->s6_addr32[2] ^ htonl(0x0000ffff))) != 0) return -EINVAL; memcpy(buf, &addr->s6_addr32[3], 4); } return 0; } #endif
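/*
 * Editor's sketch (not part of the header above): the RFC 2464 mapping that
 * ipv6_eth_mc_map() performs, written as a small standalone userspace
 * program so the result is easy to inspect.  The fixed 33:33 prefix is
 * followed by the low-order 32 bits of the group address, so ff02::1 maps
 * to 33:33:00:00:00:01.  Function and variable names here are illustrative
 * only and do not exist in the kernel.
 */
#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

static void eth_mc_map_example(const struct in6_addr *addr, unsigned char *mac)
{
	mac[0] = 0x33;
	mac[1] = 0x33;
	/* last four bytes of the 128-bit multicast group address */
	memcpy(mac + 2, &addr->s6_addr[12], 4);
}

int main(void)
{
	struct in6_addr grp;
	unsigned char mac[6];
	int i;

	inet_pton(AF_INET6, "ff02::1", &grp);
	eth_mc_map_example(&grp, mac);
	for (i = 0; i < 6; i++)
		printf("%02x%s", mac[i], i < 5 ? ":" : "\n");
	return 0;
}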
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/fs.h>
#include <linux/buffer_head.h>
#include <linux/exportfs.h>
#include <linux/iso_fs.h>
#include <asm/unaligned.h>

enum isofs_file_format {
	isofs_file_normal = 0,
	isofs_file_sparse = 1,
	isofs_file_compressed = 2,
};

/*
 * iso fs inode data in memory
 */
struct iso_inode_info {
	unsigned long i_iget5_block;
	unsigned long i_iget5_offset;
	unsigned int i_first_extent;
	unsigned char i_file_format;
	unsigned char i_format_parm[3];
	unsigned long i_next_section_block;
	unsigned long i_next_section_offset;
	off_t i_section_size;
	struct inode vfs_inode;
};

/*
 * iso9660 super-block data in memory
 */
struct isofs_sb_info {
	unsigned long s_ninodes;
	unsigned long s_nzones;
	unsigned long s_firstdatazone;
	unsigned long s_log_zone_size;
	unsigned long s_max_size;

	int           s_rock_offset; /* offset of SUSP fields within SU area */
	s32           s_sbsector;
	unsigned char s_joliet_level;
	unsigned char s_mapping;
	unsigned char s_check;
	unsigned char s_session;
	unsigned int  s_high_sierra:1;
	unsigned int  s_rock:2;
	unsigned int  s_cruft:1; /* Broken disks with high byte of length
				  * containing junk */
	unsigned int  s_nocompress:1;
	unsigned int  s_hide:1;
	unsigned int  s_showassoc:1;
	unsigned int  s_overriderockperm:1;
	unsigned int  s_uid_set:1;
	unsigned int  s_gid_set:1;

	umode_t s_fmode;
	umode_t s_dmode;
	kgid_t s_gid;
	kuid_t s_uid;
	struct nls_table *s_nls_iocharset; /* Native language support table */
};

#define ISOFS_INVALID_MODE ((umode_t) -1)

static inline struct isofs_sb_info *ISOFS_SB(struct super_block *sb)
{
	return sb->s_fs_info;
}

static inline struct iso_inode_info *ISOFS_I(struct inode *inode)
{
	return container_of(inode, struct iso_inode_info, vfs_inode);
}

static inline int isonum_711(u8 *p)
{
	return *p;
}

static inline int isonum_712(s8 *p)
{
	return *p;
}

static inline unsigned int isonum_721(u8 *p)
{
	return get_unaligned_le16(p);
}

static inline unsigned int isonum_722(u8 *p)
{
	return get_unaligned_be16(p);
}

static inline unsigned int isonum_723(u8 *p)
{
	/* Ignore bigendian datum due to broken mastering programs */
	return get_unaligned_le16(p);
}

static inline unsigned int isonum_731(u8 *p)
{
	return get_unaligned_le32(p);
}

static inline unsigned int isonum_732(u8 *p)
{
	return get_unaligned_be32(p);
}

static inline unsigned int isonum_733(u8 *p)
{
	/* Ignore bigendian datum due to broken mastering programs */
	return get_unaligned_le32(p);
}

extern int iso_date(u8 *, int);

struct inode;		/* To make gcc happy */

extern int parse_rock_ridge_inode(struct iso_directory_record *,
				  struct inode *, int relocated);
extern int get_rock_ridge_filename(struct iso_directory_record *,
				   char *, struct inode *);
extern int isofs_name_translate(struct iso_directory_record *,
				char *, struct inode *);

int get_joliet_filename(struct iso_directory_record *, unsigned char *, struct inode *);
int get_acorn_filename(struct iso_directory_record *, char *, struct inode *);

extern struct dentry *isofs_lookup(struct inode *, struct dentry *,
				   unsigned int flags);
extern struct buffer_head *isofs_bread(struct inode *, sector_t);
extern int isofs_get_blocks(struct inode *, sector_t, struct buffer_head **,
			    unsigned long);

struct inode *__isofs_iget(struct super_block *sb,
			   unsigned long block,
			   unsigned long offset,
			   int relocated);

static inline struct inode *isofs_iget(struct super_block *sb,
				       unsigned long block,
				       unsigned long offset)
{
	return __isofs_iget(sb, block, offset, 0);
}

static inline struct inode *isofs_iget_reloc(struct super_block *sb,
					     unsigned long block,
					     unsigned long offset)
{
	return __isofs_iget(sb, block, offset, 1);
}

/* Because the inode number is no longer relevant to finding the
 * underlying meta-data for an inode, we are free to choose a more
 * convenient 32-bit number as the inode number.  The inode numbering
 * scheme was recommended by Sergey Vlasov and Eric Lammerts. */
static inline unsigned long isofs_get_ino(unsigned long block,
					  unsigned long offset,
					  unsigned long bufbits)
{
	return (block << (bufbits - 5)) | (offset >> 5);
}

/* Every directory can have many redundant directory entries scattered
 * throughout the directory tree.  First there is the directory entry
 * with the name of the directory stored in the parent directory.
 * Then, there is the "." directory entry stored in the directory
 * itself.  Finally, there are possibly many ".." directory entries
 * stored in all the subdirectories.
 *
 * In order for the NFS get_parent() method to work and for the
 * general consistency of the dcache, we need to make sure the
 * "i_iget5_block" and "i_iget5_offset" all point to exactly one of
 * the many redundant entries for each directory.  We normalize the
 * block and offset by always making them point to the "." directory.
 *
 * Notice that we do not use the entry for the directory with the name
 * that is located in the parent directory.  Even though choosing this
 * first directory is more natural, it is much easier to find the "."
 * entry in the NFS get_parent() method because it is implicitly
 * encoded in the "extent + ext_attr_length" fields of _all_ the
 * redundant entries for the directory.  Thus, it can always be
 * reached regardless of which directory entry you have in hand.
 *
 * This works because the "." entry is simply the first directory
 * record when you start reading the file that holds all the directory
 * records, and this file starts at "extent + ext_attr_length" blocks.
 * Because the "." entry is always the first entry listed in the
 * directories file, the normalized "offset" value is always 0.
 *
 * You should pass the directory entry in "de".  On return, "block"
 * and "offset" will hold normalized values.  Only directories are
 * affected making it safe to call even for non-directory file
 * types. */
static inline void
isofs_normalize_block_and_offset(struct iso_directory_record *de,
				 unsigned long *block,
				 unsigned long *offset)
{
	/* Only directories are normalized. */
	if (de->flags[0] & 2) {
		*offset = 0;
		*block = (unsigned long)isonum_733(de->extent)
			+ (unsigned long)isonum_711(de->ext_attr_length);
	}
}

extern const struct inode_operations isofs_dir_inode_operations;
extern const struct file_operations isofs_dir_operations;
extern const struct address_space_operations isofs_symlink_aops;
extern const struct export_operations isofs_export_ops;
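/*
 * Editor's sketch (not part of isofs): the inode-number packing used by
 * isofs_get_ino() above, shown as a standalone program so the arithmetic
 * is easy to follow.  With 2048-byte buffers (bufbits == 11) the block
 * number is shifted left by 6 bits and the directory-record offset, in
 * 32-byte units, fills the low bits.  The block and offset values below
 * are made up purely for illustration.
 */
#include <stdio.h>

static unsigned long example_isofs_get_ino(unsigned long block,
					   unsigned long offset,
					   unsigned long bufbits)
{
	return (block << (bufbits - 5)) | (offset >> 5);
}

int main(void)
{
	unsigned long block = 0x1234;	/* hypothetical extent block */
	unsigned long offset = 0x40;	/* record offset within that block */
	unsigned long ino = example_isofs_get_ino(block, offset, 11);

	/* prints block=0x1234 offset=0x40 -> ino=0x48d02 */
	printf("block=%#lx offset=%#lx -> ino=%#lx\n", block, offset, ino);
	return 0;
}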
/*
 * Implementation of the access vector table type.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */

/* Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com>
 *
 *	Added conditional policy language extensions
 *
 * Copyright (C) 2003 Tresys Technology, LLC
 *	This program is free software; you can redistribute it and/or modify
 *	it under the terms of the GNU General Public License as published by
 *	the Free Software Foundation, version 2.
 *
 * Updated: Yuichi Nakamura <ynakam@hitachisoft.jp>
 *	Tuned number of hash slots for avtab to reduce memory usage
 */

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include "avtab.h"
#include "policydb.h"

static struct kmem_cache *avtab_node_cachep;
static struct kmem_cache *avtab_xperms_cachep;

/* Based on MurmurHash3, written by Austin Appleby and placed in the
 * public domain.
*/ static inline int avtab_hash(struct avtab_key *keyp, u32 mask) { static const u32 c1 = 0xcc9e2d51; static const u32 c2 = 0x1b873593; static const u32 r1 = 15; static const u32 r2 = 13; static const u32 m = 5; static const u32 n = 0xe6546b64; u32 hash = 0; #define mix(input) { \ u32 v = input; \ v *= c1; \ v = (v << r1) | (v >> (32 - r1)); \ v *= c2; \ hash ^= v; \ hash = (hash << r2) | (hash >> (32 - r2)); \ hash = hash * m + n; \ } mix(keyp->target_class); mix(keyp->target_type); mix(keyp->source_type); #undef mix hash ^= hash >> 16; hash *= 0x85ebca6b; hash ^= hash >> 13; hash *= 0xc2b2ae35; hash ^= hash >> 16; return hash & mask; } static struct avtab_node* avtab_insert_node(struct avtab *h, int hvalue, struct avtab_node *prev, struct avtab_node *cur, struct avtab_key *key, struct avtab_datum *datum) { struct avtab_node *newnode; struct avtab_extended_perms *xperms; newnode = kmem_cache_zalloc(avtab_node_cachep, GFP_KERNEL); if (newnode == NULL) return NULL; newnode->key = *key; if (key->specified & AVTAB_XPERMS) { xperms = kmem_cache_zalloc(avtab_xperms_cachep, GFP_KERNEL); if (xperms == NULL) { kmem_cache_free(avtab_node_cachep, newnode); return NULL; } *xperms = *(datum->u.xperms); newnode->datum.u.xperms = xperms; } else { newnode->datum.u.data = datum->u.data; } if (prev) { newnode->next = prev->next; prev->next = newnode; } else { struct avtab_node **n = &h->htable[hvalue]; newnode->next = *n; *n = newnode; } h->nel++; return newnode; } static int avtab_insert(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum) { int hvalue; struct avtab_node *prev, *cur, *newnode; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if (!h || !h->nslot) return -EINVAL; hvalue = avtab_hash(key, h->mask); for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified)) { /* extended perms may not be unique */ if (specified & AVTAB_XPERMS) break; return -EEXIST; } if (key->source_type < cur->key.source_type) break; if (key->source_type == cur->key.source_type && key->target_type < cur->key.target_type) break; if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class) break; } newnode = avtab_insert_node(h, hvalue, prev, cur, key, datum); if (!newnode) return -ENOMEM; return 0; } /* Unlike avtab_insert(), this function allow multiple insertions of the same * key/specified mask into the table, as needed by the conditional avtab. * It also returns a pointer to the node inserted. 
*/ struct avtab_node * avtab_insert_nonunique(struct avtab *h, struct avtab_key *key, struct avtab_datum *datum) { int hvalue; struct avtab_node *prev, *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if (!h || !h->nslot) return NULL; hvalue = avtab_hash(key, h->mask); for (prev = NULL, cur = h->htable[hvalue]; cur; prev = cur, cur = cur->next) { if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified)) break; if (key->source_type < cur->key.source_type) break; if (key->source_type == cur->key.source_type && key->target_type < cur->key.target_type) break; if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class) break; } return avtab_insert_node(h, hvalue, prev, cur, key, datum); } struct avtab_datum *avtab_search(struct avtab *h, struct avtab_key *key) { int hvalue; struct avtab_node *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if (!h || !h->nslot) return NULL; hvalue = avtab_hash(key, h->mask); for (cur = h->htable[hvalue]; cur; cur = cur->next) { if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified)) return &cur->datum; if (key->source_type < cur->key.source_type) break; if (key->source_type == cur->key.source_type && key->target_type < cur->key.target_type) break; if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class) break; } return NULL; } /* This search function returns a node pointer, and can be used in * conjunction with avtab_search_next_node() */ struct avtab_node* avtab_search_node(struct avtab *h, struct avtab_key *key) { int hvalue; struct avtab_node *cur; u16 specified = key->specified & ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); if (!h || !h->nslot) return NULL; hvalue = avtab_hash(key, h->mask); for (cur = h->htable[hvalue]; cur; cur = cur->next) { if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class == cur->key.target_class && (specified & cur->key.specified)) return cur; if (key->source_type < cur->key.source_type) break; if (key->source_type == cur->key.source_type && key->target_type < cur->key.target_type) break; if (key->source_type == cur->key.source_type && key->target_type == cur->key.target_type && key->target_class < cur->key.target_class) break; } return NULL; } struct avtab_node* avtab_search_node_next(struct avtab_node *node, int specified) { struct avtab_node *cur; if (!node) return NULL; specified &= ~(AVTAB_ENABLED|AVTAB_ENABLED_OLD); for (cur = node->next; cur; cur = cur->next) { if (node->key.source_type == cur->key.source_type && node->key.target_type == cur->key.target_type && node->key.target_class == cur->key.target_class && (specified & cur->key.specified)) return cur; if (node->key.source_type < cur->key.source_type) break; if (node->key.source_type == cur->key.source_type && node->key.target_type < cur->key.target_type) break; if (node->key.source_type == cur->key.source_type && node->key.target_type == cur->key.target_type && node->key.target_class < cur->key.target_class) break; } return NULL; } void avtab_destroy(struct avtab *h) { int i; struct avtab_node *cur, *temp; if (!h) return; for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; while (cur) { temp = 
cur; cur = cur->next; if (temp->key.specified & AVTAB_XPERMS) kmem_cache_free(avtab_xperms_cachep, temp->datum.u.xperms); kmem_cache_free(avtab_node_cachep, temp); } } kvfree(h->htable); h->htable = NULL; h->nel = 0; h->nslot = 0; h->mask = 0; } void avtab_init(struct avtab *h) { h->htable = NULL; h->nel = 0; h->nslot = 0; h->mask = 0; } static int avtab_alloc_common(struct avtab *h, u32 nslot) { if (!nslot) return 0; h->htable = kvcalloc(nslot, sizeof(void *), GFP_KERNEL); if (!h->htable) return -ENOMEM; h->nslot = nslot; h->mask = nslot - 1; return 0; } int avtab_alloc(struct avtab *h, u32 nrules) { int rc; u32 nslot = 0; if (nrules != 0) { u32 shift = 1; u32 work = nrules >> 3; while (work) { work >>= 1; shift++; } nslot = 1 << shift; if (nslot > MAX_AVTAB_HASH_BUCKETS) nslot = MAX_AVTAB_HASH_BUCKETS; rc = avtab_alloc_common(h, nslot); if (rc) return rc; } pr_debug("SELinux: %d avtab hash slots, %d rules.\n", nslot, nrules); return 0; } int avtab_alloc_dup(struct avtab *new, const struct avtab *orig) { return avtab_alloc_common(new, orig->nslot); } void avtab_hash_eval(struct avtab *h, char *tag) { int i, chain_len, slots_used, max_chain_len; unsigned long long chain2_len_sum; struct avtab_node *cur; slots_used = 0; max_chain_len = 0; chain2_len_sum = 0; for (i = 0; i < h->nslot; i++) { cur = h->htable[i]; if (cur) { slots_used++; chain_len = 0; while (cur) { chain_len++; cur = cur->next; } if (chain_len > max_chain_len) max_chain_len = chain_len; chain2_len_sum += chain_len * chain_len; } } pr_debug("SELinux: %s: %d entries and %d/%d buckets used, " "longest chain length %d sum of chain length^2 %llu\n", tag, h->nel, slots_used, h->nslot, max_chain_len, chain2_len_sum); } static uint16_t spec_order[] = { AVTAB_ALLOWED, AVTAB_AUDITDENY, AVTAB_AUDITALLOW, AVTAB_TRANSITION, AVTAB_CHANGE, AVTAB_MEMBER, AVTAB_XPERMS_ALLOWED, AVTAB_XPERMS_AUDITALLOW, AVTAB_XPERMS_DONTAUDIT }; int avtab_read_item(struct avtab *a, void *fp, struct policydb *pol, int (*insertf)(struct avtab *a, struct avtab_key *k, struct avtab_datum *d, void *p), void *p) { __le16 buf16[4]; u16 enabled; u32 items, items2, val, vers = pol->policyvers; struct avtab_key key; struct avtab_datum datum; struct avtab_extended_perms xperms; __le32 buf32[ARRAY_SIZE(xperms.perms.p)]; int i, rc; unsigned set; memset(&key, 0, sizeof(struct avtab_key)); memset(&datum, 0, sizeof(struct avtab_datum)); if (vers < POLICYDB_VERSION_AVTAB) { rc = next_entry(buf32, fp, sizeof(u32)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items2 = le32_to_cpu(buf32[0]); if (items2 > ARRAY_SIZE(buf32)) { pr_err("SELinux: avtab: entry overflow\n"); return -EINVAL; } rc = next_entry(buf32, fp, sizeof(u32)*items2); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items = 0; val = le32_to_cpu(buf32[items++]); key.source_type = (u16)val; if (key.source_type != val) { pr_err("SELinux: avtab: truncated source type\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); key.target_type = (u16)val; if (key.target_type != val) { pr_err("SELinux: avtab: truncated target type\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); key.target_class = (u16)val; if (key.target_class != val) { pr_err("SELinux: avtab: truncated target class\n"); return -EINVAL; } val = le32_to_cpu(buf32[items++]); enabled = (val & AVTAB_ENABLED_OLD) ? 
AVTAB_ENABLED : 0; if (!(val & (AVTAB_AV | AVTAB_TYPE))) { pr_err("SELinux: avtab: null entry\n"); return -EINVAL; } if ((val & AVTAB_AV) && (val & AVTAB_TYPE)) { pr_err("SELinux: avtab: entry has both access vectors and types\n"); return -EINVAL; } if (val & AVTAB_XPERMS) { pr_err("SELinux: avtab: entry has extended permissions\n"); return -EINVAL; } for (i = 0; i < ARRAY_SIZE(spec_order); i++) { if (val & spec_order[i]) { key.specified = spec_order[i] | enabled; datum.u.data = le32_to_cpu(buf32[items++]); rc = insertf(a, &key, &datum, p); if (rc) return rc; } } if (items != items2) { pr_err("SELinux: avtab: entry only had %d items, expected %d\n", items2, items); return -EINVAL; } return 0; } rc = next_entry(buf16, fp, sizeof(u16)*4); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } items = 0; key.source_type = le16_to_cpu(buf16[items++]); key.target_type = le16_to_cpu(buf16[items++]); key.target_class = le16_to_cpu(buf16[items++]); key.specified = le16_to_cpu(buf16[items++]); if (!policydb_type_isvalid(pol, key.source_type) || !policydb_type_isvalid(pol, key.target_type) || !policydb_class_isvalid(pol, key.target_class)) { pr_err("SELinux: avtab: invalid type or class\n"); return -EINVAL; } set = 0; for (i = 0; i < ARRAY_SIZE(spec_order); i++) { if (key.specified & spec_order[i]) set++; } if (!set || set > 1) { pr_err("SELinux: avtab: more than one specifier\n"); return -EINVAL; } if ((vers < POLICYDB_VERSION_XPERMS_IOCTL) && (key.specified & AVTAB_XPERMS)) { pr_err("SELinux: avtab: policy version %u does not " "support extended permissions rules and one " "was specified\n", vers); return -EINVAL; } else if (key.specified & AVTAB_XPERMS) { memset(&xperms, 0, sizeof(struct avtab_extended_perms)); rc = next_entry(&xperms.specified, fp, sizeof(u8)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } rc = next_entry(&xperms.driver, fp, sizeof(u8)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } rc = next_entry(buf32, fp, sizeof(u32)*ARRAY_SIZE(xperms.perms.p)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } for (i = 0; i < ARRAY_SIZE(xperms.perms.p); i++) xperms.perms.p[i] = le32_to_cpu(buf32[i]); datum.u.xperms = &xperms; } else { rc = next_entry(buf32, fp, sizeof(u32)); if (rc) { pr_err("SELinux: avtab: truncated entry\n"); return rc; } datum.u.data = le32_to_cpu(*buf32); } if ((key.specified & AVTAB_TYPE) && !policydb_type_isvalid(pol, datum.u.data)) { pr_err("SELinux: avtab: invalid type\n"); return -EINVAL; } return insertf(a, &key, &datum, p); } static int avtab_insertf(struct avtab *a, struct avtab_key *k, struct avtab_datum *d, void *p) { return avtab_insert(a, k, d); } int avtab_read(struct avtab *a, void *fp, struct policydb *pol) { int rc; __le32 buf[1]; u32 nel, i; rc = next_entry(buf, fp, sizeof(u32)); if (rc < 0) { pr_err("SELinux: avtab: truncated table\n"); goto bad; } nel = le32_to_cpu(buf[0]); if (!nel) { pr_err("SELinux: avtab: table is empty\n"); rc = -EINVAL; goto bad; } rc = avtab_alloc(a, nel); if (rc) goto bad; for (i = 0; i < nel; i++) { rc = avtab_read_item(a, fp, pol, avtab_insertf, NULL); if (rc) { if (rc == -ENOMEM) pr_err("SELinux: avtab: out of memory\n"); else if (rc == -EEXIST) pr_err("SELinux: avtab: duplicate entry\n"); goto bad; } } rc = 0; out: return rc; bad: avtab_destroy(a); goto out; } int avtab_write_item(struct policydb *p, struct avtab_node *cur, void *fp) { __le16 buf16[4]; __le32 buf32[ARRAY_SIZE(cur->datum.u.xperms->perms.p)]; int rc; unsigned int i; buf16[0] = 
cpu_to_le16(cur->key.source_type); buf16[1] = cpu_to_le16(cur->key.target_type); buf16[2] = cpu_to_le16(cur->key.target_class); buf16[3] = cpu_to_le16(cur->key.specified); rc = put_entry(buf16, sizeof(u16), 4, fp); if (rc) return rc; if (cur->key.specified & AVTAB_XPERMS) { rc = put_entry(&cur->datum.u.xperms->specified, sizeof(u8), 1, fp); if (rc) return rc; rc = put_entry(&cur->datum.u.xperms->driver, sizeof(u8), 1, fp); if (rc) return rc; for (i = 0; i < ARRAY_SIZE(cur->datum.u.xperms->perms.p); i++) buf32[i] = cpu_to_le32(cur->datum.u.xperms->perms.p[i]); rc = put_entry(buf32, sizeof(u32), ARRAY_SIZE(cur->datum.u.xperms->perms.p), fp); } else { buf32[0] = cpu_to_le32(cur->datum.u.data); rc = put_entry(buf32, sizeof(u32), 1, fp); } if (rc) return rc; return 0; } int avtab_write(struct policydb *p, struct avtab *a, void *fp) { unsigned int i; int rc = 0; struct avtab_node *cur; __le32 buf[1]; buf[0] = cpu_to_le32(a->nel); rc = put_entry(buf, sizeof(u32), 1, fp); if (rc) return rc; for (i = 0; i < a->nslot; i++) { for (cur = a->htable[i]; cur; cur = cur->next) { rc = avtab_write_item(p, cur, fp); if (rc) return rc; } } return rc; } void __init avtab_cache_init(void) { avtab_node_cachep = kmem_cache_create("avtab_node", sizeof(struct avtab_node), 0, SLAB_PANIC, NULL); avtab_xperms_cachep = kmem_cache_create("avtab_extended_perms", sizeof(struct avtab_extended_perms), 0, SLAB_PANIC, NULL); }
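/*
 * Editor's sketch (not part of the SELinux build): the hash-table sizing
 * rule used by avtab_alloc() above, pulled out into a standalone program
 * so the resulting bucket counts are easy to inspect.  The number of
 * slots is roughly the next power of two above nrules/8, capped by
 * MAX_AVTAB_HASH_BUCKETS; EXAMPLE_MAX_BUCKETS below is only a placeholder
 * for that cap, whose real value lives in avtab.h.
 */
#include <stdio.h>
#include <stdint.h>

#define EXAMPLE_MAX_BUCKETS 8192	/* placeholder for MAX_AVTAB_HASH_BUCKETS */

static uint32_t example_avtab_nslot(uint32_t nrules)
{
	uint32_t shift = 1;
	uint32_t work = nrules >> 3;
	uint32_t nslot;

	if (!nrules)
		return 0;
	while (work) {
		work >>= 1;
		shift++;
	}
	nslot = 1u << shift;
	if (nslot > EXAMPLE_MAX_BUCKETS)
		nslot = EXAMPLE_MAX_BUCKETS;
	return nslot;
}

int main(void)
{
	uint32_t samples[] = { 1, 100, 1000, 100000 };
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("nrules=%u -> nslot=%u\n", samples[i],
		       example_avtab_nslot(samples[i]));
	return 0;
}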
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_PERCPU_COUNTER_H
#define _LINUX_PERCPU_COUNTER_H
/*
 * A simple "approximate counter" for use in ext2 and ext3 superblocks.
 *
 * WARNING: these things are HUGE.  4 kbytes per counter on 32-way P4.
 */

#include <linux/spinlock.h>
#include <linux/smp.h>
#include <linux/list.h>
#include <linux/threads.h>
#include <linux/percpu.h>
#include <linux/types.h>
#include <linux/gfp.h>

#ifdef CONFIG_SMP

struct percpu_counter {
	raw_spinlock_t lock;
	s64 count;
#ifdef CONFIG_HOTPLUG_CPU
	struct list_head list;	/* All percpu_counters are on a list */
#endif
	s32 __percpu *counters;
};

extern int percpu_counter_batch;

int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
			  struct lock_class_key *key);

#define percpu_counter_init(fbc, value, gfp)				\
	({								\
		static struct lock_class_key __key;			\
									\
		__percpu_counter_init(fbc, value, gfp, &__key);		\
	})

void percpu_counter_destroy(struct percpu_counter *fbc);
void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
			      s32 batch);
s64 __percpu_counter_sum(struct percpu_counter *fbc);
int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
void percpu_counter_sync(struct percpu_counter *fbc);

static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
	return __percpu_counter_compare(fbc, rhs, percpu_counter_batch);
}

static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
}

static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
{
	s64 ret = __percpu_counter_sum(fbc);
	return ret < 0 ? 0 : ret;
}

static inline s64 percpu_counter_sum(struct percpu_counter *fbc)
{
	return __percpu_counter_sum(fbc);
}

static inline s64 percpu_counter_read(struct percpu_counter *fbc)
{
	return fbc->count;
}

/*
 * It is possible for the percpu_counter_read() to return a small negative
 * number for some counter which should never be negative.
 */
static inline s64 percpu_counter_read_positive(struct percpu_counter *fbc)
{
	/* Prevent reloads of fbc->count */
	s64 ret = READ_ONCE(fbc->count);

	if (ret >= 0)
		return ret;
	return 0;
}

static inline bool percpu_counter_initialized(struct percpu_counter *fbc)
{
	return (fbc->counters != NULL);
}

#else /* !CONFIG_SMP */

struct percpu_counter {
	s64 count;
};

static inline int percpu_counter_init(struct percpu_counter *fbc, s64 amount,
				      gfp_t gfp)
{
	fbc->count = amount;
	return 0;
}

static inline void percpu_counter_destroy(struct percpu_counter *fbc)
{
}

static inline void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
{
	fbc->count = amount;
}

static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
{
	if (fbc->count > rhs)
		return 1;
	else if (fbc->count < rhs)
		return -1;
	else
		return 0;
}

static inline int
__percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch)
{
	return percpu_counter_compare(fbc, rhs);
}

static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
{
	preempt_disable();
	fbc->count += amount;
	preempt_enable();
}

static inline void percpu_counter_add_batch(struct percpu_counter *fbc,
					    s64 amount, s32 batch)
{
	percpu_counter_add(fbc, amount);
}
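/*
 * Editor's note (a sketch, not part of this header): typical in-kernel
 * usage of the percpu_counter API declared above, kept inside a comment
 * so it does not disturb the !CONFIG_SMP branch of the header.  Error
 * handling is elided and the counter name is made up.
 *
 *	struct percpu_counter nr_widgets;
 *
 *	ret = percpu_counter_init(&nr_widgets, 0, GFP_KERNEL);
 *	...
 *	percpu_counter_add(&nr_widgets, 1);	// cheap per-CPU fast path
 *	total = percpu_counter_sum(&nr_widgets);	// slow, exact sum
 *	approx = percpu_counter_read_positive(&nr_widgets); // fast, approximate
 *	...
 *	percpu_counter_destroy(&nr_widgets);
 */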