1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Checksumming functions for IP, TCP, UDP and so on * * Authors: Jorge Cwik, <jorge@laser.satlink.net> * Arnt Gulbrandsen, <agulbra@nvg.unit.no> * Borrows very liberally from tcp.c and ip.c, see those * files for more names. */ #ifndef _CHECKSUM_H #define _CHECKSUM_H #include <linux/errno.h> #include <asm/types.h> #include <asm/byteorder.h> #include <linux/uaccess.h> #include <asm/checksum.h> #ifndef _HAVE_ARCH_COPY_AND_CSUM_FROM_USER static inline __wsum csum_and_copy_from_user (const void __user *src, void *dst, int len) { if (copy_from_user(dst, src, len)) return 0; return csum_partial(dst, len, ~0U); } #endif #ifndef HAVE_CSUM_COPY_USER static __inline__ __wsum csum_and_copy_to_user (const void *src, void __user *dst, int len) { __wsum sum = csum_partial(src, len, ~0U); if (copy_to_user(dst, src, len) == 0) return sum; return 0; } #endif #ifndef _HAVE_ARCH_CSUM_AND_COPY static inline __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len) { memcpy(dst, src, len); return csum_partial(dst, len, 0); } #endif #ifndef HAVE_ARCH_CSUM_ADD static inline __wsum csum_add(__wsum csum, __wsum addend) { u32 res = (__force u32)csum; res += (__force u32)addend; return (__force __wsum)(res + (res < (__force u32)addend)); } #endif static inline __wsum csum_sub(__wsum csum, __wsum addend) { return csum_add(csum, ~addend); } static inline __sum16 csum16_add(__sum16 csum, __be16 addend) { u16 res = (__force u16)csum; res += (__force u16)addend; return (__force __sum16)(res + (res < (__force u16)addend)); } static inline __sum16 csum16_sub(__sum16 csum, __be16 addend) { return csum16_add(csum, ~addend); } static inline __wsum csum_block_add(__wsum csum, __wsum csum2, int offset) { u32 sum = (__force u32)csum2; /* rotate sum to align it with a 16b boundary */ if (offset & 1) sum = ror32(sum, 8); return csum_add(csum, (__force __wsum)sum); } static inline __wsum csum_block_add_ext(__wsum csum, __wsum csum2, int offset, int len) { return csum_block_add(csum, csum2, offset); } static inline __wsum csum_block_sub(__wsum csum, __wsum csum2, int offset) { return csum_block_add(csum, ~csum2, offset); } static inline __wsum csum_unfold(__sum16 n) { return (__force __wsum)n; } static inline __wsum csum_partial_ext(const void *buff, int len, __wsum sum) { return csum_partial(buff, len, sum); } #define CSUM_MANGLED_0 ((__force __sum16)0xffff) static inline void csum_replace_by_diff(__sum16 *sum, __wsum diff) { *sum = csum_fold(csum_add(diff, ~csum_unfold(*sum))); } static inline void csum_replace4(__sum16 *sum, __be32 from, __be32 to) { __wsum tmp = csum_sub(~csum_unfold(*sum), (__force __wsum)from); *sum = csum_fold(csum_add(tmp, (__force __wsum)to)); } /* Implements RFC 1624 (Incremental Internet Checksum) * 3. Discussion states : * HC' = ~(~HC + ~m + m') * m : old value of a 16bit field * m' : new value of a 16bit field */ static inline void csum_replace2(__sum16 *sum, __be16 old, __be16 new) { *sum = ~csum16_add(csum16_sub(~(*sum), old), new); } struct sk_buff; void inet_proto_csum_replace4(__sum16 *sum, struct sk_buff *skb, __be32 from, __be32 to, bool pseudohdr); void inet_proto_csum_replace16(__sum16 *sum, struct sk_buff *skb, const __be32 *from, const __be32 *to, bool pseudohdr); void inet_proto_csum_replace_by_diff(__sum16 *sum, struct sk_buff *skb, __wsum diff, bool pseudohdr); static inline void inet_proto_csum_replace2(__sum16 *sum, struct sk_buff *skb, __be16 from, __be16 to, bool pseudohdr) { inet_proto_csum_replace4(sum, skb, (__force __be32)from, (__force __be32)to, pseudohdr); } static inline __wsum remcsum_adjust(void *ptr, __wsum csum, int start, int offset) { __sum16 *psum = (__sum16 *)(ptr + offset); __wsum delta; /* Subtract out checksum up to start */ csum = csum_sub(csum, csum_partial(ptr, start, 0)); /* Set derived checksum in packet */ delta = csum_sub((__force __wsum)csum_fold(csum), (__force __wsum)*psum); *psum = csum_fold(csum); return delta; } static inline void remcsum_unadjust(__sum16 *psum, __wsum delta) { *psum = csum_fold(csum_sub(delta, (__force __wsum)*psum)); } #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 /* SPDX-License-Identifier: GPL-2.0 */ /* * Written by Mark Hemment, 1996 (markhe@nextd.demon.co.uk). * * (C) SGI 2006, Christoph Lameter * Cleaned up and restructured to ease the addition of alternative * implementations of SLAB allocators. * (C) Linux Foundation 2008-2013 * Unified interface for all slab allocators */ #ifndef _LINUX_SLAB_H #define _LINUX_SLAB_H #include <linux/gfp.h> #include <linux/overflow.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/percpu-refcount.h> /* * Flags to pass to kmem_cache_create(). * The ones marked DEBUG are only valid if CONFIG_DEBUG_SLAB is set. */ /* DEBUG: Perform (expensive) checks on alloc/free */ #define SLAB_CONSISTENCY_CHECKS ((slab_flags_t __force)0x00000100U) /* DEBUG: Red zone objs in a cache */ #define SLAB_RED_ZONE ((slab_flags_t __force)0x00000400U) /* DEBUG: Poison objects */ #define SLAB_POISON ((slab_flags_t __force)0x00000800U) /* Align objs on cache lines */ #define SLAB_HWCACHE_ALIGN ((slab_flags_t __force)0x00002000U) /* Use GFP_DMA memory */ #define SLAB_CACHE_DMA ((slab_flags_t __force)0x00004000U) /* Use GFP_DMA32 memory */ #define SLAB_CACHE_DMA32 ((slab_flags_t __force)0x00008000U) /* DEBUG: Store the last owner for bug hunting */ #define SLAB_STORE_USER ((slab_flags_t __force)0x00010000U) /* Panic if kmem_cache_create() fails */ #define SLAB_PANIC ((slab_flags_t __force)0x00040000U) /* * SLAB_TYPESAFE_BY_RCU - **WARNING** READ THIS! * * This delays freeing the SLAB page by a grace period, it does _NOT_ * delay object freeing. This means that if you do kmem_cache_free() * that memory location is free to be reused at any time. Thus it may * be possible to see another object there in the same RCU grace period. * * This feature only ensures the memory location backing the object * stays valid, the trick to using this is relying on an independent * object validation pass. Something like: * * rcu_read_lock() * again: * obj = lockless_lookup(key); * if (obj) { * if (!try_get_ref(obj)) // might fail for free objects * goto again; * * if (obj->key != key) { // not the object we expected * put_ref(obj); * goto again; * } * } * rcu_read_unlock(); * * This is useful if we need to approach a kernel structure obliquely, * from its address obtained without the usual locking. We can lock * the structure to stabilize it and check it's still at the given address, * only if we can be sure that the memory has not been meanwhile reused * for some other kind of object (which our subsystem's lock might corrupt). * * rcu_read_lock before reading the address, then rcu_read_unlock after * taking the spinlock within the structure expected at that address. * * Note that SLAB_TYPESAFE_BY_RCU was originally named SLAB_DESTROY_BY_RCU. */ /* Defer freeing slabs to RCU */ #define SLAB_TYPESAFE_BY_RCU ((slab_flags_t __force)0x00080000U) /* Spread some memory over cpuset */ #define SLAB_MEM_SPREAD ((slab_flags_t __force)0x00100000U) /* Trace allocations and frees */ #define SLAB_TRACE ((slab_flags_t __force)0x00200000U) /* Flag to prevent checks on free */ #ifdef CONFIG_DEBUG_OBJECTS # define SLAB_DEBUG_OBJECTS ((slab_flags_t __force)0x00400000U) #else # define SLAB_DEBUG_OBJECTS 0 #endif /* Avoid kmemleak tracing */ #define SLAB_NOLEAKTRACE ((slab_flags_t __force)0x00800000U) /* Fault injection mark */ #ifdef CONFIG_FAILSLAB # define SLAB_FAILSLAB ((slab_flags_t __force)0x02000000U) #else # define SLAB_FAILSLAB 0 #endif /* Account to memcg */ #ifdef CONFIG_MEMCG_KMEM # define SLAB_ACCOUNT ((slab_flags_t __force)0x04000000U) #else # define SLAB_ACCOUNT 0 #endif #ifdef CONFIG_KASAN #define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else #define SLAB_KASAN 0 #endif /* The following flags affect the page allocator grouping pages by mobility */ /* Objects are reclaimable */ #define SLAB_RECLAIM_ACCOUNT ((slab_flags_t __force)0x00020000U) #define SLAB_TEMPORARY SLAB_RECLAIM_ACCOUNT /* Objects are short-lived */ /* Slab deactivation flag */ #define SLAB_DEACTIVATED ((slab_flags_t __force)0x10000000U) /* * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests. * * Dereferencing ZERO_SIZE_PTR will lead to a distinct access fault. * * ZERO_SIZE_PTR can be passed to kfree though in the same way that NULL can. * Both make kfree a no-op. */ #define ZERO_SIZE_PTR ((void *)16) #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \ (unsigned long)ZERO_SIZE_PTR) #include <linux/kasan.h> struct mem_cgroup; /* * struct kmem_cache related prototypes */ void __init kmem_cache_init(void); bool slab_is_available(void); extern bool usercopy_fallback; struct kmem_cache *kmem_cache_create(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, void (*ctor)(void *)); struct kmem_cache *kmem_cache_create_usercopy(const char *name, unsigned int size, unsigned int align, slab_flags_t flags, unsigned int useroffset, unsigned int usersize, void (*ctor)(void *)); void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); /* * Please use this macro to create slab caches. Simply specify the * name of the structure and maybe some flags that are listed above. * * The alignment of the struct determines object alignment. If you * f.e. add ____cacheline_aligned_in_smp to the struct declaration * then the objects will be properly aligned in SMP configurations. */ #define KMEM_CACHE(__struct, __flags) \ kmem_cache_create(#__struct, sizeof(struct __struct), \ __alignof__(struct __struct), (__flags), NULL) /* * To whitelist a single field for copying to/from usercopy, use this * macro instead for KMEM_CACHE() above. */ #define KMEM_CACHE_USERCOPY(__struct, __flags, __field) \ kmem_cache_create_usercopy(#__struct, \ sizeof(struct __struct), \ __alignof__(struct __struct), (__flags), \ offsetof(struct __struct, __field), \ sizeof_field(struct __struct, __field), NULL) /* * Common kmalloc functions provided by all allocators */ void * __must_check krealloc(const void *, size_t, gfp_t); void kfree(const void *); void kfree_sensitive(const void *); size_t __ksize(const void *); size_t ksize(const void *); #ifdef CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR void __check_heap_object(const void *ptr, unsigned long n, struct page *page, bool to_user); #else static inline void __check_heap_object(const void *ptr, unsigned long n, struct page *page, bool to_user) { } #endif /* * Some archs want to perform DMA into kmalloc caches and need a guaranteed * alignment larger than the alignment of a 64-bit integer. * Setting ARCH_KMALLOC_MINALIGN in arch headers allows that. */ #if defined(ARCH_DMA_MINALIGN) && ARCH_DMA_MINALIGN > 8 #define ARCH_KMALLOC_MINALIGN ARCH_DMA_MINALIGN #define KMALLOC_MIN_SIZE ARCH_DMA_MINALIGN #define KMALLOC_SHIFT_LOW ilog2(ARCH_DMA_MINALIGN) #else #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) #endif /* * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment. * Intended for arches that get misalignment faults even for 64 bit integer * aligned buffers. */ #ifndef ARCH_SLAB_MINALIGN #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) #endif /* * kmalloc and friends return ARCH_KMALLOC_MINALIGN aligned * pointers. kmem_cache_alloc and friends return ARCH_SLAB_MINALIGN * aligned pointers. */ #define __assume_kmalloc_alignment __assume_aligned(ARCH_KMALLOC_MINALIGN) #define __assume_slab_alignment __assume_aligned(ARCH_SLAB_MINALIGN) #define __assume_page_alignment __assume_aligned(PAGE_SIZE) /* * Kmalloc array related definitions */ #ifdef CONFIG_SLAB /* * The largest kmalloc size supported by the SLAB allocators is * 32 megabyte (2^25) or the maximum allocatable page order if that is * less than 32 MB. * * WARNING: Its not easy to increase this value since the allocators have * to do various tricks to work around compiler limitations in order to * ensure proper constant folding. */ #define KMALLOC_SHIFT_HIGH ((MAX_ORDER + PAGE_SHIFT - 1) <= 25 ? \ (MAX_ORDER + PAGE_SHIFT - 1) : 25) #define KMALLOC_SHIFT_MAX KMALLOC_SHIFT_HIGH #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 5 #endif #endif #ifdef CONFIG_SLUB /* * SLUB directly allocates requests fitting in to an order-1 page * (PAGE_SIZE*2). Larger requests are passed to the page allocator. */ #define KMALLOC_SHIFT_HIGH (PAGE_SHIFT + 1) #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif #endif #ifdef CONFIG_SLOB /* * SLOB passes all requests larger than one page to the page allocator. * No kmalloc array is necessary since objects of different sizes can * be allocated from the same page. */ #define KMALLOC_SHIFT_HIGH PAGE_SHIFT #define KMALLOC_SHIFT_MAX (MAX_ORDER + PAGE_SHIFT - 1) #ifndef KMALLOC_SHIFT_LOW #define KMALLOC_SHIFT_LOW 3 #endif #endif /* Maximum allocatable size */ #define KMALLOC_MAX_SIZE (1UL << KMALLOC_SHIFT_MAX) /* Maximum size for which we actually use a slab cache */ #define KMALLOC_MAX_CACHE_SIZE (1UL << KMALLOC_SHIFT_HIGH) /* Maximum order allocatable via the slab allocator */ #define KMALLOC_MAX_ORDER (KMALLOC_SHIFT_MAX - PAGE_SHIFT) /* * Kmalloc subsystem. */ #ifndef KMALLOC_MIN_SIZE #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW) #endif /* * This restriction comes from byte sized index implementation. * Page size is normally 2^12 bytes and, in this case, if we want to use * byte sized index which can represent 2^8 entries, the size of the object * should be equal or greater to 2^12 / 2^8 = 2^4 = 16. * If minimum size of kmalloc is less than 16, we use it as minimum object * size and give up to use byte sized index. */ #define SLAB_OBJ_MIN_SIZE (KMALLOC_MIN_SIZE < 16 ? \ (KMALLOC_MIN_SIZE) : 16) /* * Whenever changing this, take care of that kmalloc_type() and * create_kmalloc_caches() still work as intended. */ enum kmalloc_cache_type { KMALLOC_NORMAL = 0, KMALLOC_RECLAIM, #ifdef CONFIG_ZONE_DMA KMALLOC_DMA, #endif NR_KMALLOC_TYPES }; #ifndef CONFIG_SLOB extern struct kmem_cache * kmalloc_caches[NR_KMALLOC_TYPES][KMALLOC_SHIFT_HIGH + 1]; static __always_inline enum kmalloc_cache_type kmalloc_type(gfp_t flags) { #ifdef CONFIG_ZONE_DMA /* * The most common case is KMALLOC_NORMAL, so test for it * with a single branch for both flags. */ if (likely((flags & (__GFP_DMA | __GFP_RECLAIMABLE)) == 0)) return KMALLOC_NORMAL; /* * At least one of the flags has to be set. If both are, __GFP_DMA * is more important. */ return flags & __GFP_DMA ? KMALLOC_DMA : KMALLOC_RECLAIM; #else return flags & __GFP_RECLAIMABLE ? KMALLOC_RECLAIM : KMALLOC_NORMAL; #endif } /* * Figure out which kmalloc slab an allocation of a certain size * belongs to. * 0 = zero alloc * 1 = 65 .. 96 bytes * 2 = 129 .. 192 bytes * n = 2^(n-1)+1 .. 2^n */ static __always_inline unsigned int kmalloc_index(size_t size) { if (!size) return 0; if (size <= KMALLOC_MIN_SIZE) return KMALLOC_SHIFT_LOW; if (KMALLOC_MIN_SIZE <= 32 && size > 64 && size <= 96) return 1; if (KMALLOC_MIN_SIZE <= 64 && size > 128 && size <= 192) return 2; if (size <= 8) return 3; if (size <= 16) return 4; if (size <= 32) return 5; if (size <= 64) return 6; if (size <= 128) return 7; if (size <= 256) return 8; if (size <= 512) return 9; if (size <= 1024) return 10; if (size <= 2 * 1024) return 11; if (size <= 4 * 1024) return 12; if (size <= 8 * 1024) return 13; if (size <= 16 * 1024) return 14; if (size <= 32 * 1024) return 15; if (size <= 64 * 1024) return 16; if (size <= 128 * 1024) return 17; if (size <= 256 * 1024) return 18; if (size <= 512 * 1024) return 19; if (size <= 1024 * 1024) return 20; if (size <= 2 * 1024 * 1024) return 21; if (size <= 4 * 1024 * 1024) return 22; if (size <= 8 * 1024 * 1024) return 23; if (size <= 16 * 1024 * 1024) return 24; if (size <= 32 * 1024 * 1024) return 25; if (size <= 64 * 1024 * 1024) return 26; BUG(); /* Will never be reached. Needed because the compiler may complain */ return -1; } #endif /* !CONFIG_SLOB */ void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; void kmem_cache_free(struct kmem_cache *, void *); /* * Bulk allocation and freeing operations. These are accelerated in an * allocator specific way to avoid taking locks repeatedly or building * metadata structures unnecessarily. * * Note that interrupts must be enabled when calling these functions. */ void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **); int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **); /* * Caller must not use kfree_bulk() on memory not originally allocated * by kmalloc(), because the SLOB allocator cannot handle this. */ static __always_inline void kfree_bulk(size_t size, void **p) { kmem_cache_free_bulk(NULL, size, p); } #ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; #else static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __kmalloc(size, flags); } static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node) { return kmem_cache_alloc(s, flags); } #endif #ifdef CONFIG_TRACING extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t) __assume_slab_alignment __malloc; #ifdef CONFIG_NUMA extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) __assume_slab_alignment __malloc; #else static __always_inline void * kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { return kmem_cache_alloc_trace(s, gfpflags, size); } #endif /* CONFIG_NUMA */ #else /* CONFIG_TRACING */ static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t flags, size_t size) { void *ret = kmem_cache_alloc(s, flags); ret = kasan_kmalloc(s, ret, size, flags); return ret; } static __always_inline void * kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, int node, size_t size) { void *ret = kmem_cache_alloc_node(s, gfpflags, node); ret = kasan_kmalloc(s, ret, size, gfpflags); return ret; } #endif /* CONFIG_TRACING */ extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) __assume_page_alignment __malloc; #else static __always_inline void * kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { return kmalloc_order(size, flags, order); } #endif static __always_inline void *kmalloc_large(size_t size, gfp_t flags) { unsigned int order = get_order(size); return kmalloc_order_trace(size, flags, order); } /** * kmalloc - allocate memory * @size: how many bytes of memory are required. * @flags: the type of memory to allocate. * * kmalloc is the normal method of allocating memory * for objects smaller than page size in the kernel. * * The allocated object address is aligned to at least ARCH_KMALLOC_MINALIGN * bytes. For @size of power of two bytes, the alignment is also guaranteed * to be at least to the size. * * The @flags argument may be one of the GFP flags defined at * include/linux/gfp.h and described at * :ref:`Documentation/core-api/mm-api.rst <mm-api-gfp-flags>` * * The recommended usage of the @flags is described at * :ref:`Documentation/core-api/memory-allocation.rst <memory_allocation>` * * Below is a brief outline of the most useful GFP flags * * %GFP_KERNEL * Allocate normal kernel ram. May sleep. * * %GFP_NOWAIT * Allocation will not sleep. * * %GFP_ATOMIC * Allocation will not sleep. May use emergency pools. * * %GFP_HIGHUSER * Allocate memory from high memory on behalf of user. * * Also it is possible to set different flags by OR'ing * in one or more of the following additional @flags: * * %__GFP_HIGH * This allocation has high priority and may use emergency pools. * * %__GFP_NOFAIL * Indicate that this allocation is in no way allowed to fail * (think twice before using). * * %__GFP_NORETRY * If memory is not immediately available, * then give up at once. * * %__GFP_NOWARN * If allocation fails, don't issue any warnings. * * %__GFP_RETRY_MAYFAIL * Try really hard to succeed the allocation but fail * eventually. */ static __always_inline void *kmalloc(size_t size, gfp_t flags) { if (__builtin_constant_p(size)) { #ifndef CONFIG_SLOB unsigned int index; #endif if (size > KMALLOC_MAX_CACHE_SIZE) return kmalloc_large(size, flags); #ifndef CONFIG_SLOB index = kmalloc_index(size); if (!index) return ZERO_SIZE_PTR; return kmem_cache_alloc_trace( kmalloc_caches[kmalloc_type(flags)][index], flags, size); #endif } return __kmalloc(size, flags); } static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) { #ifndef CONFIG_SLOB if (__builtin_constant_p(size) && size <= KMALLOC_MAX_CACHE_SIZE) { unsigned int i = kmalloc_index(size); if (!i) return ZERO_SIZE_PTR; return kmem_cache_alloc_node_trace( kmalloc_caches[kmalloc_type(flags)][i], flags, node, size); } #endif return __kmalloc_node(size, flags, node); } /** * kmalloc_array - allocate memory for an array. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) return kmalloc(bytes, flags); return __kmalloc(bytes, flags); } /** * kcalloc - allocate memory for an array. The memory is set to zero. * @n: number of elements. * @size: element size. * @flags: the type of memory to allocate (see kmalloc). */ static inline void *kcalloc(size_t n, size_t size, gfp_t flags) { return kmalloc_array(n, size, flags | __GFP_ZERO); } /* * kmalloc_track_caller is a special version of kmalloc that records the * calling function of the routine calling it for slab leak tracking instead * of just the calling function (confusing, eh?). * It's useful when the call to kmalloc comes from a widely-used standard * allocator where we care about the real place the memory allocation * request comes from. */ extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ __kmalloc_track_caller(size, flags, _RET_IP_) static inline void *kmalloc_array_node(size_t n, size_t size, gfp_t flags, int node) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; if (__builtin_constant_p(n) && __builtin_constant_p(size)) return kmalloc_node(bytes, flags, node); return __kmalloc_node(bytes, flags, node); } static inline void *kcalloc_node(size_t n, size_t size, gfp_t flags, int node) { return kmalloc_array_node(n, size, flags | __GFP_ZERO, node); } #ifdef CONFIG_NUMA extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ _RET_IP_) #else /* CONFIG_NUMA */ #define kmalloc_node_track_caller(size, flags, node) \ kmalloc_track_caller(size, flags) #endif /* CONFIG_NUMA */ /* * Shortcuts */ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags) { return kmem_cache_alloc(k, flags | __GFP_ZERO); } /** * kzalloc - allocate memory. The memory is set to zero. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). */ static inline void *kzalloc(size_t size, gfp_t flags) { return kmalloc(size, flags | __GFP_ZERO); } /** * kzalloc_node - allocate zeroed memory from a particular memory node. * @size: how many bytes of memory are required. * @flags: the type of memory to allocate (see kmalloc). * @node: memory node from which to allocate */ static inline void *kzalloc_node(size_t size, gfp_t flags, int node) { return kmalloc_node(size, flags | __GFP_ZERO, node); } unsigned int kmem_cache_size(struct kmem_cache *s); void __init kmem_cache_init_late(void); #if defined(CONFIG_SMP) && defined(CONFIG_SLAB) int slab_prepare_cpu(unsigned int cpu); int slab_dead_cpu(unsigned int cpu); #else #define slab_prepare_cpu NULL #define slab_dead_cpu NULL #endif #endif /* _LINUX_SLAB_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_DISK_H #define _SCSI_DISK_H /* * More than enough for everybody ;) The huge number of majors * is a leftover from 16bit dev_t days, we don't really need that * much numberspace. */ #define SD_MAJORS 16 /* * Time out in seconds for disks and Magneto-opticals (which are slower). */ #define SD_TIMEOUT (30 * HZ) #define SD_MOD_TIMEOUT (75 * HZ) /* * Flush timeout is a multiplier over the standard device timeout which is * user modifiable via sysfs but initially set to SD_TIMEOUT */ #define SD_FLUSH_TIMEOUT_MULTIPLIER 2 #define SD_WRITE_SAME_TIMEOUT (120 * HZ) /* * Number of allowed retries */ #define SD_MAX_RETRIES 5 #define SD_PASSTHROUGH_RETRIES 1 #define SD_MAX_MEDIUM_TIMEOUTS 2 /* * Size of the initial data buffer for mode and read capacity data */ #define SD_BUF_SIZE 512 /* * Number of sectors at the end of the device to avoid multi-sector * accesses to in the case of last_sector_bug */ #define SD_LAST_BUGGY_SECTORS 8 enum { SD_EXT_CDB_SIZE = 32, /* Extended CDB size */ SD_MEMPOOL_SIZE = 2, /* CDB pool size */ }; enum { SD_DEF_XFER_BLOCKS = 0xffff, SD_MAX_XFER_BLOCKS = 0xffffffff, SD_MAX_WS10_BLOCKS = 0xffff, SD_MAX_WS16_BLOCKS = 0x7fffff, }; enum { SD_LBP_FULL = 0, /* Full logical block provisioning */ SD_LBP_UNMAP, /* Use UNMAP command */ SD_LBP_WS16, /* Use WRITE SAME(16) with UNMAP bit */ SD_LBP_WS10, /* Use WRITE SAME(10) with UNMAP bit */ SD_LBP_ZERO, /* Use WRITE SAME(10) with zero payload */ SD_LBP_DISABLE, /* Discard disabled due to failed cmd */ }; enum { SD_ZERO_WRITE = 0, /* Use WRITE(10/16) command */ SD_ZERO_WS, /* Use WRITE SAME(10/16) command */ SD_ZERO_WS16_UNMAP, /* Use WRITE SAME(16) with UNMAP */ SD_ZERO_WS10_UNMAP, /* Use WRITE SAME(10) with UNMAP */ }; struct scsi_disk { struct scsi_driver *driver; /* always &sd_template */ struct scsi_device *device; struct device dev; struct gendisk *disk; struct opal_dev *opal_dev; #ifdef CONFIG_BLK_DEV_ZONED u32 nr_zones; u32 rev_nr_zones; u32 zone_blocks; u32 rev_zone_blocks; u32 zones_optimal_open; u32 zones_optimal_nonseq; u32 zones_max_open; u32 *zones_wp_offset; spinlock_t zones_wp_offset_lock; u32 *rev_wp_offset; struct mutex rev_mutex; struct work_struct zone_wp_offset_work; char *zone_wp_update_buf; #endif atomic_t openers; sector_t capacity; /* size in logical blocks */ int max_retries; u32 max_xfer_blocks; u32 opt_xfer_blocks; u32 max_ws_blocks; u32 max_unmap_blocks; u32 unmap_granularity; u32 unmap_alignment; u32 index; unsigned int physical_block_size; unsigned int max_medium_access_timeouts; unsigned int medium_access_timed_out; u8 media_present; u8 write_prot; u8 protection_type;/* Data Integrity Field */ u8 provisioning_mode; u8 zeroing_mode; unsigned ATO : 1; /* state of disk ATO bit */ unsigned cache_override : 1; /* temp override of WCE,RCD */ unsigned WCE : 1; /* state of disk WCE bit */ unsigned RCD : 1; /* state of disk RCD bit, unused */ unsigned DPOFUA : 1; /* state of disk DPOFUA bit */ unsigned first_scan : 1; unsigned lbpme : 1; unsigned lbprz : 1; unsigned lbpu : 1; unsigned lbpws : 1; unsigned lbpws10 : 1; unsigned lbpvpd : 1; unsigned ws10 : 1; unsigned ws16 : 1; unsigned rc_basis: 2; unsigned zoned: 2; unsigned urswrz : 1; unsigned security : 1; unsigned ignore_medium_access_errors : 1; }; #define to_scsi_disk(obj) container_of(obj,struct scsi_disk,dev) static inline struct scsi_disk *scsi_disk(struct gendisk *disk) { return container_of(disk->private_data, struct scsi_disk, driver); } #define sd_printk(prefix, sdsk, fmt, a...) \ (sdsk)->disk ? \ sdev_prefix_printk(prefix, (sdsk)->device, \ (sdsk)->disk->disk_name, fmt, ##a) : \ sdev_printk(prefix, (sdsk)->device, fmt, ##a) #define sd_first_printk(prefix, sdsk, fmt, a...) \ do { \ if ((sdsk)->first_scan) \ sd_printk(prefix, sdsk, fmt, ##a); \ } while (0) static inline int scsi_medium_access_command(struct scsi_cmnd *scmd) { switch (scmd->cmnd[0]) { case READ_6: case READ_10: case READ_12: case READ_16: case SYNCHRONIZE_CACHE: case VERIFY: case VERIFY_12: case VERIFY_16: case WRITE_6: case WRITE_10: case WRITE_12: case WRITE_16: case WRITE_SAME: case WRITE_SAME_16: case UNMAP: return 1; case VARIABLE_LENGTH_CMD: switch (scmd->cmnd[9]) { case READ_32: case VERIFY_32: case WRITE_32: case WRITE_SAME_32: return 1; } } return 0; } static inline sector_t logical_to_sectors(struct scsi_device *sdev, sector_t blocks) { return blocks << (ilog2(sdev->sector_size) - 9); } static inline unsigned int logical_to_bytes(struct scsi_device *sdev, sector_t blocks) { return blocks * sdev->sector_size; } static inline sector_t bytes_to_logical(struct scsi_device *sdev, unsigned int bytes) { return bytes >> ilog2(sdev->sector_size); } static inline sector_t sectors_to_logical(struct scsi_device *sdev, sector_t sector) { return sector >> (ilog2(sdev->sector_size) - 9); } #ifdef CONFIG_BLK_DEV_INTEGRITY extern void sd_dif_config_host(struct scsi_disk *); #else /* CONFIG_BLK_DEV_INTEGRITY */ static inline void sd_dif_config_host(struct scsi_disk *disk) { } #endif /* CONFIG_BLK_DEV_INTEGRITY */ static inline int sd_is_zoned(struct scsi_disk *sdkp) { return sdkp->zoned == 1 || sdkp->device->type == TYPE_ZBC; } #ifdef CONFIG_BLK_DEV_ZONED void sd_zbc_release_disk(struct scsi_disk *sdkp); int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buffer); int sd_zbc_revalidate_zones(struct scsi_disk *sdkp); blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all); unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr); int sd_zbc_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, unsigned int nr_blocks); #else /* CONFIG_BLK_DEV_ZONED */ static inline void sd_zbc_release_disk(struct scsi_disk *sdkp) {} static inline int sd_zbc_read_zones(struct scsi_disk *sdkp, unsigned char *buf) { return 0; } static inline int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) { return 0; } static inline blk_status_t sd_zbc_setup_zone_mgmt_cmnd(struct scsi_cmnd *cmd, unsigned char op, bool all) { return BLK_STS_TARGET; } static inline unsigned int sd_zbc_complete(struct scsi_cmnd *cmd, unsigned int good_bytes, struct scsi_sense_hdr *sshdr) { return good_bytes; } static inline blk_status_t sd_zbc_prepare_zone_append(struct scsi_cmnd *cmd, sector_t *lba, unsigned int nr_blocks) { return BLK_STS_TARGET; } #define sd_zbc_report_zones NULL #endif /* CONFIG_BLK_DEV_ZONED */ void sd_print_sense_hdr(struct scsi_disk *sdkp, struct scsi_sense_hdr *sshdr); void sd_print_result(const struct scsi_disk *sdkp, const char *msg, int result); #endif /* _SCSI_DISK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FIB_RULES_H #define __NET_FIB_RULES_H #include <linux/types.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/fib_rules.h> #include <linux/refcount.h> #include <net/flow.h> #include <net/rtnetlink.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> struct fib_kuid_range { kuid_t start; kuid_t end; }; struct fib_rule { struct list_head list; int iifindex; int oifindex; u32 mark; u32 mark_mask; u32 flags; u32 table; u8 action; u8 l3mdev; u8 proto; u8 ip_proto; u32 target; __be64 tun_id; struct fib_rule __rcu *ctarget; struct net *fr_net; refcount_t refcnt; u32 pref; int suppress_ifgroup; int suppress_prefixlen; char iifname[IFNAMSIZ]; char oifname[IFNAMSIZ]; struct fib_kuid_range uid_range; struct fib_rule_port_range sport_range; struct fib_rule_port_range dport_range; struct rcu_head rcu; }; struct fib_lookup_arg { void *lookup_ptr; const void *lookup_data; void *result; struct fib_rule *rule; u32 table; int flags; #define FIB_LOOKUP_NOREF 1 #define FIB_LOOKUP_IGNORE_LINKSTATE 2 }; struct fib_rules_ops { int family; struct list_head list; int rule_size; int addr_size; int unresolved_rules; int nr_goto_rules; unsigned int fib_rules_seq; int (*action)(struct fib_rule *, struct flowi *, int, struct fib_lookup_arg *); bool (*suppress)(struct fib_rule *, int, struct fib_lookup_arg *); int (*match)(struct fib_rule *, struct flowi *, int); int (*configure)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *, struct nlattr **, struct netlink_ext_ack *); int (*delete)(struct fib_rule *); int (*compare)(struct fib_rule *, struct fib_rule_hdr *, struct nlattr **); int (*fill)(struct fib_rule *, struct sk_buff *, struct fib_rule_hdr *); size_t (*nlmsg_payload)(struct fib_rule *); /* Called after modifications to the rules set, must flush * the route cache if one exists. */ void (*flush_cache)(struct fib_rules_ops *ops); int nlgroup; const struct nla_policy *policy; struct list_head rules_list; struct module *owner; struct net *fro_net; struct rcu_head rcu; }; struct fib_rule_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_rule *rule; }; #define FRA_GENERIC_POLICY \ [FRA_UNSPEC] = { .strict_start_type = FRA_DPORT_RANGE + 1 }, \ [FRA_IIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \ [FRA_OIFNAME] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 }, \ [FRA_PRIORITY] = { .type = NLA_U32 }, \ [FRA_FWMARK] = { .type = NLA_U32 }, \ [FRA_TUN_ID] = { .type = NLA_U64 }, \ [FRA_FWMASK] = { .type = NLA_U32 }, \ [FRA_TABLE] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_PREFIXLEN] = { .type = NLA_U32 }, \ [FRA_SUPPRESS_IFGROUP] = { .type = NLA_U32 }, \ [FRA_GOTO] = { .type = NLA_U32 }, \ [FRA_L3MDEV] = { .type = NLA_U8 }, \ [FRA_UID_RANGE] = { .len = sizeof(struct fib_rule_uid_range) }, \ [FRA_PROTOCOL] = { .type = NLA_U8 }, \ [FRA_IP_PROTO] = { .type = NLA_U8 }, \ [FRA_SPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) }, \ [FRA_DPORT_RANGE] = { .len = sizeof(struct fib_rule_port_range) } static inline void fib_rule_get(struct fib_rule *rule) { refcount_inc(&rule->refcnt); } static inline void fib_rule_put(struct fib_rule *rule) { if (refcount_dec_and_test(&rule->refcnt)) kfree_rcu(rule, rcu); } #ifdef CONFIG_NET_L3_MASTER_DEV static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->l3mdev ? arg->table : rule->table; } #else static inline u32 fib_rule_get_table(struct fib_rule *rule, struct fib_lookup_arg *arg) { return rule->table; } #endif static inline u32 frh_get_table(struct fib_rule_hdr *frh, struct nlattr **nla) { if (nla[FRA_TABLE]) return nla_get_u32(nla[FRA_TABLE]); return frh->table; } static inline bool fib_rule_port_range_set(const struct fib_rule_port_range *range) { return range->start != 0 && range->end != 0; } static inline bool fib_rule_port_inrange(const struct fib_rule_port_range *a, __be16 port) { return ntohs(port) >= a->start && ntohs(port) <= a->end; } static inline bool fib_rule_port_range_valid(const struct fib_rule_port_range *a) { return a->start != 0 && a->end != 0 && a->end < 0xffff && a->start <= a->end; } static inline bool fib_rule_port_range_compare(struct fib_rule_port_range *a, struct fib_rule_port_range *b) { return a->start == b->start && a->end == b->end; } static inline bool fib_rule_requires_fldissect(struct fib_rule *rule) { return rule->iifindex != LOOPBACK_IFINDEX && (rule->ip_proto || fib_rule_port_range_set(&rule->sport_range) || fib_rule_port_range_set(&rule->dport_range)); } struct fib_rules_ops *fib_rules_register(const struct fib_rules_ops *, struct net *); void fib_rules_unregister(struct fib_rules_ops *); int fib_rules_lookup(struct fib_rules_ops *, struct flowi *, int flags, struct fib_lookup_arg *); int fib_default_rule_add(struct fib_rules_ops *, u32 pref, u32 table, u32 flags); bool fib_rule_matchall(const struct fib_rule *rule); int fib_rules_dump(struct net *net, struct notifier_block *nb, int family, struct netlink_ext_ack *extack); unsigned int fib_rules_seq_read(struct net *net, int family); int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack); INDIRECT_CALLABLE_DECLARE(int fib6_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)); INDIRECT_CALLABLE_DECLARE(int fib6_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib6_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); INDIRECT_CALLABLE_DECLARE(bool fib4_rule_suppress(struct fib_rule *rule, int flags, struct fib_lookup_arg *arg)); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 /* SPDX-License-Identifier: GPL-2.0 */ /* * Routines to manage notifier chains for passing status changes to any * interested routines. We need this instead of hard coded call lists so * that modules can poke their nose into the innards. The network devices * needed them so here they are for the rest of you. * * Alan Cox <Alan.Cox@linux.org> */ #ifndef _LINUX_NOTIFIER_H #define _LINUX_NOTIFIER_H #include <linux/errno.h> #include <linux/mutex.h> #include <linux/rwsem.h> #include <linux/srcu.h> /* * Notifier chains are of four types: * * Atomic notifier chains: Chain callbacks run in interrupt/atomic * context. Callouts are not allowed to block. * Blocking notifier chains: Chain callbacks run in process context. * Callouts are allowed to block. * Raw notifier chains: There are no restrictions on callbacks, * registration, or unregistration. All locking and protection * must be provided by the caller. * SRCU notifier chains: A variant of blocking notifier chains, with * the same restrictions. * * atomic_notifier_chain_register() may be called from an atomic context, * but blocking_notifier_chain_register() and srcu_notifier_chain_register() * must be called from a process context. Ditto for the corresponding * _unregister() routines. * * atomic_notifier_chain_unregister(), blocking_notifier_chain_unregister(), * and srcu_notifier_chain_unregister() _must not_ be called from within * the call chain. * * SRCU notifier chains are an alternative form of blocking notifier chains. * They use SRCU (Sleepable Read-Copy Update) instead of rw-semaphores for * protection of the chain links. This means there is _very_ low overhead * in srcu_notifier_call_chain(): no cache bounces and no memory barriers. * As compensation, srcu_notifier_chain_unregister() is rather expensive. * SRCU notifier chains should be used when the chain will be called very * often but notifier_blocks will seldom be removed. */ struct notifier_block; typedef int (*notifier_fn_t)(struct notifier_block *nb, unsigned long action, void *data); struct notifier_block { notifier_fn_t notifier_call; struct notifier_block __rcu *next; int priority; }; struct atomic_notifier_head { spinlock_t lock; struct notifier_block __rcu *head; }; struct blocking_notifier_head { struct rw_semaphore rwsem; struct notifier_block __rcu *head; }; struct raw_notifier_head { struct notifier_block __rcu *head; }; struct srcu_notifier_head { struct mutex mutex; struct srcu_struct srcu; struct notifier_block __rcu *head; }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ spin_lock_init(&(name)->lock); \ (name)->head = NULL; \ } while (0) #define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \ init_rwsem(&(name)->rwsem); \ (name)->head = NULL; \ } while (0) #define RAW_INIT_NOTIFIER_HEAD(name) do { \ (name)->head = NULL; \ } while (0) /* srcu_notifier_heads must be cleaned up dynamically */ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); #define srcu_cleanup_notifier_head(name) \ cleanup_srcu_struct(&(name)->srcu); #define ATOMIC_NOTIFIER_INIT(name) { \ .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ .head = NULL } #define BLOCKING_NOTIFIER_INIT(name) { \ .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ .head = NULL } #define RAW_NOTIFIER_INIT(name) { \ .head = NULL } #define SRCU_NOTIFIER_INIT(name, pcpu) \ { \ .mutex = __MUTEX_INITIALIZER(name.mutex), \ .head = NULL, \ .srcu = __SRCU_STRUCT_INIT(name.srcu, pcpu), \ } #define ATOMIC_NOTIFIER_HEAD(name) \ struct atomic_notifier_head name = \ ATOMIC_NOTIFIER_INIT(name) #define BLOCKING_NOTIFIER_HEAD(name) \ struct blocking_notifier_head name = \ BLOCKING_NOTIFIER_INIT(name) #define RAW_NOTIFIER_HEAD(name) \ struct raw_notifier_head name = \ RAW_NOTIFIER_INIT(name) #ifdef CONFIG_TREE_SRCU #define _SRCU_NOTIFIER_HEAD(name, mod) \ static DEFINE_PER_CPU(struct srcu_data, name##_head_srcu_data); \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name##_head_srcu_data) #else #define _SRCU_NOTIFIER_HEAD(name, mod) \ mod struct srcu_notifier_head name = \ SRCU_NOTIFIER_INIT(name, name) #endif #define SRCU_NOTIFIER_HEAD(name) \ _SRCU_NOTIFIER_HEAD(name, /* not static */) #define SRCU_NOTIFIER_HEAD_STATIC(name) \ _SRCU_NOTIFIER_HEAD(name, static) #ifdef __KERNEL__ extern int atomic_notifier_chain_register(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_register(struct blocking_notifier_head *nh, struct notifier_block *nb); extern int raw_notifier_chain_register(struct raw_notifier_head *nh, struct notifier_block *nb); extern int srcu_notifier_chain_register(struct srcu_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, struct notifier_block *nb); extern int blocking_notifier_chain_unregister(struct blocking_notifier_head *nh, struct notifier_block *nb); extern int raw_notifier_chain_unregister(struct raw_notifier_head *nh, struct notifier_block *nb); extern int srcu_notifier_chain_unregister(struct srcu_notifier_head *nh, struct notifier_block *nb); extern int atomic_notifier_call_chain(struct atomic_notifier_head *nh, unsigned long val, void *v); extern int blocking_notifier_call_chain(struct blocking_notifier_head *nh, unsigned long val, void *v); extern int raw_notifier_call_chain(struct raw_notifier_head *nh, unsigned long val, void *v); extern int srcu_notifier_call_chain(struct srcu_notifier_head *nh, unsigned long val, void *v); extern int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern int blocking_notifier_call_chain_robust(struct blocking_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); extern int raw_notifier_call_chain_robust(struct raw_notifier_head *nh, unsigned long val_up, unsigned long val_down, void *v); #define NOTIFY_DONE 0x0000 /* Don't care */ #define NOTIFY_OK 0x0001 /* Suits me */ #define NOTIFY_STOP_MASK 0x8000 /* Don't call further */ #define NOTIFY_BAD (NOTIFY_STOP_MASK|0x0002) /* Bad/Veto action */ /* * Clean way to return from the notifier and stop further calls. */ #define NOTIFY_STOP (NOTIFY_OK|NOTIFY_STOP_MASK) /* Encapsulate (negative) errno value (in particular, NOTIFY_BAD <=> EPERM). */ static inline int notifier_from_errno(int err) { if (err) return NOTIFY_STOP_MASK | (NOTIFY_OK - err); return NOTIFY_OK; } /* Restore (negative) errno value from notify return value. */ static inline int notifier_to_errno(int ret) { ret &= ~NOTIFY_STOP_MASK; return ret > NOTIFY_OK ? NOTIFY_OK - ret : 0; } /* * Declared notifiers so far. I can imagine quite a few more chains * over time (eg laptop power reset chains, reboot chain (to clean * device units up), device [un]mount chain, module load/unload chain, * low memory chain, screenblank chain (for plug in modular screenblankers) * VC switch chains (for loadable kernel svgalib VC switch helpers) etc... */ /* CPU notfiers are defined in include/linux/cpu.h. */ /* netdevice notifiers are defined in include/linux/netdevice.h */ /* reboot notifiers are defined in include/linux/reboot.h. */ /* Hibernation and suspend events are defined in include/linux/suspend.h. */ /* Virtual Terminal events are defined in include/linux/vt.h. */ #define NETLINK_URELEASE 0x0001 /* Unicast netlink socket released */ /* Console keyboard events. * Note: KBD_KEYCODE is always sent before KBD_UNBOUND_KEYCODE, KBD_UNICODE and * KBD_KEYSYM. */ #define KBD_KEYCODE 0x0001 /* Keyboard keycode, called before any other */ #define KBD_UNBOUND_KEYCODE 0x0002 /* Keyboard keycode which is not bound to any other */ #define KBD_UNICODE 0x0003 /* Keyboard unicode */ #define KBD_KEYSYM 0x0004 /* Keyboard keysym */ #define KBD_POST_KEYSYM 0x0005 /* Called after keyboard keysym interpretation */ extern struct blocking_notifier_head reboot_notifier_list; #endif /* __KERNEL__ */ #endif /* _LINUX_NOTIFIER_H */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/balloc.c * * Copyright (C) 1992, 1993, 1994, 1995 * Remy Card (card@masi.ibp.fr) * Laboratoire MASI - Institut Blaise Pascal * Universite Pierre et Marie Curie (Paris VI) * * Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993 * Big-endian to little-endian byte-swapping/bitmaps by * David S. Miller (davem@caip.rutgers.edu), 1995 */ #include <linux/time.h> #include <linux/capability.h> #include <linux/fs.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include "ext4.h" #include "ext4_jbd2.h" #include "mballoc.h" #include <trace/events/ext4.h> static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group); /* * balloc.c contains the blocks allocation and deallocation routines */ /* * Calculate block group number for a given block number */ ext4_group_t ext4_get_group_number(struct super_block *sb, ext4_fsblk_t block) { ext4_group_t group; if (test_opt2(sb, STD_GROUP_SIZE)) group = (block - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) >> (EXT4_BLOCK_SIZE_BITS(sb) + EXT4_CLUSTER_BITS(sb) + 3); else ext4_get_group_no_and_offset(sb, block, &group, NULL); return group; } /* * Calculate the block group number and offset into the block/cluster * allocation bitmap, given a block number */ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; ext4_grpblk_t offset; blocknr = blocknr - le32_to_cpu(es->s_first_data_block); offset = do_div(blocknr, EXT4_BLOCKS_PER_GROUP(sb)) >> EXT4_SB(sb)->s_cluster_bits; if (offsetp) *offsetp = offset; if (blockgrpp) *blockgrpp = blocknr; } /* * Check whether the 'block' lives within the 'block_group'. Returns 1 if so * and 0 otherwise. */ static inline int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block, ext4_group_t block_group) { ext4_group_t actual_group; actual_group = ext4_get_group_number(sb, block); return (actual_group == block_group) ? 1 : 0; } /* Return the number of clusters used for file system metadata; this * represents the overhead needed by the file system. */ static unsigned ext4_num_overhead_clusters(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned num_clusters; int block_cluster = -1, inode_cluster = -1, itbl_cluster = -1, i, c; ext4_fsblk_t start = ext4_group_first_block_no(sb, block_group); ext4_fsblk_t itbl_blk; struct ext4_sb_info *sbi = EXT4_SB(sb); /* This is the number of clusters used by the superblock, * block group descriptors, and reserved block group * descriptor blocks */ num_clusters = ext4_num_base_meta_clusters(sb, block_group); /* * For the allocation bitmaps and inode table, we first need * to check to see if the block is in the block group. If it * is, then check to see if the cluster is already accounted * for in the clusters used for the base metadata cluster, or * if we can increment the base metadata cluster to include * that block. Otherwise, we will have to track the cluster * used for the allocation bitmap or inode table explicitly. * Normally all of these blocks are contiguous, so the special * case handling shouldn't be necessary except for *very* * unusual file system layouts. */ if (ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp), block_group)) { block_cluster = EXT4_B2C(sbi, ext4_block_bitmap(sb, gdp) - start); if (block_cluster < num_clusters) block_cluster = -1; else if (block_cluster == num_clusters) { num_clusters++; block_cluster = -1; } } if (ext4_block_in_group(sb, ext4_inode_bitmap(sb, gdp), block_group)) { inode_cluster = EXT4_B2C(sbi, ext4_inode_bitmap(sb, gdp) - start); if (inode_cluster < num_clusters) inode_cluster = -1; else if (inode_cluster == num_clusters) { num_clusters++; inode_cluster = -1; } } itbl_blk = ext4_inode_table(sb, gdp); for (i = 0; i < sbi->s_itb_per_group; i++) { if (ext4_block_in_group(sb, itbl_blk + i, block_group)) { c = EXT4_B2C(sbi, itbl_blk + i - start); if ((c < num_clusters) || (c == inode_cluster) || (c == block_cluster) || (c == itbl_cluster)) continue; if (c == num_clusters) { num_clusters++; continue; } num_clusters++; itbl_cluster = c; } } if (block_cluster != -1) num_clusters++; if (inode_cluster != -1) num_clusters++; return num_clusters; } static unsigned int num_clusters_in_group(struct super_block *sb, ext4_group_t block_group) { unsigned int blocks; if (block_group == ext4_get_groups_count(sb) - 1) { /* * Even though mke2fs always initializes the first and * last group, just in case some other tool was used, * we need to make sure we calculate the right free * blocks. */ blocks = ext4_blocks_count(EXT4_SB(sb)->s_es) - ext4_group_first_block_no(sb, block_group); } else blocks = EXT4_BLOCKS_PER_GROUP(sb); return EXT4_NUM_B2C(EXT4_SB(sb), blocks); } /* Initializes an uninitialized block bitmap */ static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { unsigned int bit, bit_max; struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_fsblk_t start, tmp; J_ASSERT_BH(bh, buffer_locked(bh)); /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT | EXT4_GROUP_INFO_IBITMAP_CORRUPT); return -EFSBADCRC; } memset(bh->b_data, 0, sb->s_blocksize); bit_max = ext4_num_base_meta_clusters(sb, block_group); if ((bit_max >> 3) >= bh->b_size) return -EFSCORRUPTED; for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ tmp = ext4_block_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_bitmap(sb, gdp); if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); tmp = ext4_inode_table(sb, gdp); for (; tmp < ext4_inode_table(sb, gdp) + sbi->s_itb_per_group; tmp++) { if (ext4_block_in_group(sb, tmp, block_group)) ext4_set_bit(EXT4_B2C(sbi, tmp - start), bh->b_data); } /* * Also if the number of blocks within the group is less than * the blocksize * 8 ( which is the size of bitmap ), set rest * of the block bitmap to 1 */ ext4_mark_bitmap_end(num_clusters_in_group(sb, block_group), sb->s_blocksize * 8, bh->b_data); return 0; } /* Return the number of free blocks in a block group. It is used when * the block bitmap is uninitialized, so we can't just count the bits * in the bitmap. */ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } /* * The free blocks are managed by bitmaps. A file system contains several * blocks groups. Each group contains 1 bitmap block for blocks, 1 bitmap * block for inodes, N blocks for the inode table and data blocks. * * The file system contains group descriptors which are located after the * super block. Each descriptor contains the number of the bitmap block and * the free blocks count in the block. The descriptors are loaded in memory * when a file system is mounted (see ext4_fill_super). */ /** * ext4_get_group_desc() -- load group descriptor from disk * @sb: super block * @block_group: given block group * @bh: pointer to the buffer head to store the block * group descriptor */ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { unsigned int group_desc; unsigned int offset; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh_p; if (block_group >= ngroups) { ext4_error(sb, "block_group >= groups_count - block_group = %u," " groups_count = %u", block_group, ngroups); return NULL; } group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); bh_p = sbi_array_rcu_deref(sbi, s_group_desc, group_desc); /* * sbi_array_rcu_deref returns with rcu unlocked, this is ok since * the pointer being dereferenced won't be dereferenced again. By * looking at the usage in add_new_gdb() the value isn't modified, * just the pointer, and so it remains valid. */ if (!bh_p) { ext4_error(sb, "Group descriptor not loaded - " "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } desc = (struct ext4_group_desc *)( (__u8 *)bh_p->b_data + offset * EXT4_DESC_SIZE(sb)); if (bh) *bh = bh_p; return desc; } /* * Return the block number which was discovered to be invalid, or 0 if * the block bitmap is valid. */ static ext4_fsblk_t ext4_valid_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_grpblk_t offset; ext4_grpblk_t next_zero_bit; ext4_grpblk_t max_bit = EXT4_CLUSTERS_PER_GROUP(sb); ext4_fsblk_t blk; ext4_fsblk_t group_first_block; if (ext4_has_feature_flex_bg(sb)) { /* with FLEX_BG, the inode/block bitmaps and itable * blocks may not be in the group at all * so the bitmap validation will be skipped for those groups * or it has to also read the block group where the bitmaps * are located to verify they are set. */ return 0; } group_first_block = ext4_group_first_block_no(sb, block_group); /* check whether block bitmap block number is set */ blk = ext4_block_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode bitmap block number is set */ blk = ext4_inode_bitmap(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || !ext4_test_bit(EXT4_B2C(sbi, offset), bh->b_data)) /* bad block bitmap */ return blk; /* check whether the inode table block number is set */ blk = ext4_inode_table(sb, desc); offset = blk - group_first_block; if (offset < 0 || EXT4_B2C(sbi, offset) >= max_bit || EXT4_B2C(sbi, offset + sbi->s_itb_per_group) >= max_bit) return blk; next_zero_bit = ext4_find_next_zero_bit(bh->b_data, EXT4_B2C(sbi, offset + sbi->s_itb_per_group), EXT4_B2C(sbi, offset)); if (next_zero_bit < EXT4_B2C(sbi, offset + sbi->s_itb_per_group)) /* bad bitmap for inode tables */ return blk; return 0; } static int ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_desc *desc, ext4_group_t block_group, struct buffer_head *bh) { ext4_fsblk_t blk; struct ext4_group_info *grp; if (EXT4_SB(sb)->s_mount_state & EXT4_FC_REPLAY) return 0; grp = ext4_get_group_info(sb, block_group); if (buffer_verified(bh)) return 0; if (EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return -EFSCORRUPTED; ext4_lock_group(sb, block_group); if (buffer_verified(bh)) goto verified; if (unlikely(!ext4_block_bitmap_csum_verify(sb, block_group, desc, bh) || ext4_simulate_fail(sb, EXT4_SIM_BBITMAP_CRC))) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: bad block bitmap checksum", block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSBADCRC; } blk = ext4_valid_block_bitmap(sb, desc, block_group, bh); if (unlikely(blk != 0)) { ext4_unlock_group(sb, block_group); ext4_error(sb, "bg %u: block %llu: invalid block bitmap", block_group, blk); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EFSCORRUPTED; } set_buffer_verified(bh); verified: ext4_unlock_group(sb, block_group); return 0; } /** * ext4_read_block_bitmap_nowait() * @sb: super block * @block_group: given block group * * Read the bitmap for a given block_group,and validate the * bits for block/inode/inode tables are set in the bitmaps * * Return buffer_head on success or an ERR_PTR in case of failure. */ struct buffer_head * ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group, bool ignore_locked) { struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); struct buffer_head *bh; ext4_fsblk_t bitmap_blk; int err; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return ERR_PTR(-EFSCORRUPTED); bitmap_blk = ext4_block_bitmap(sb, desc); if ((bitmap_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (bitmap_blk >= ext4_blocks_count(sbi->s_es))) { ext4_error(sb, "Invalid block bitmap block %llu in " "block_group %u", bitmap_blk, block_group); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return ERR_PTR(-EFSCORRUPTED); } bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { ext4_warning(sb, "Cannot get buffer for block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return ERR_PTR(-ENOMEM); } if (ignore_locked && buffer_locked(bh)) { /* buffer under IO already, return if called for prefetching */ put_bh(bh); return NULL; } if (bitmap_uptodate(bh)) goto verify; lock_buffer(bh); if (bitmap_uptodate(bh)) { unlock_buffer(bh); goto verify; } ext4_lock_group(sb, block_group); if (ext4_has_group_desc_csum(sb) && (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) { if (block_group == 0) { ext4_unlock_group(sb, block_group); unlock_buffer(bh); ext4_error(sb, "Block bitmap for bg 0 marked " "uninitialized"); err = -EFSCORRUPTED; goto out; } err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); set_buffer_verified(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); if (err) { ext4_error(sb, "Failed to init block bitmap for group " "%u: %d", block_group, err); goto out; } goto verify; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, * bitmap is also uptodate */ set_bitmap_uptodate(bh); unlock_buffer(bh); goto verify; } /* * submit the buffer_head for reading */ set_buffer_new(bh); trace_ext4_read_block_bitmap_load(sb, block_group, ignore_locked); ext4_read_bh_nowait(bh, REQ_META | REQ_PRIO | (ignore_locked ? REQ_RAHEAD : 0), ext4_end_bitmap_read); return bh; verify: err = ext4_validate_block_bitmap(sb, desc, block_group, bh); if (err) goto out; return bh; out: put_bh(bh); return ERR_PTR(err); } /* Returns 0 on success, -errno on error */ int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group, struct buffer_head *bh) { struct ext4_group_desc *desc; if (!buffer_new(bh)) return 0; desc = ext4_get_group_desc(sb, block_group, NULL); if (!desc) return -EFSCORRUPTED; wait_on_buffer(bh); ext4_simulate_fail_bh(sb, bh, EXT4_SIM_BBITMAP_EIO); if (!buffer_uptodate(bh)) { ext4_error_err(sb, EIO, "Cannot read block bitmap - " "block_group = %u, block_bitmap = %llu", block_group, (unsigned long long) bh->b_blocknr); ext4_mark_group_bitmap_corrupted(sb, block_group, EXT4_GROUP_INFO_BBITMAP_CORRUPT); return -EIO; } clear_buffer_new(bh); /* Panic or remount fs read-only if block bitmap is invalid */ return ext4_validate_block_bitmap(sb, desc, block_group, bh); } struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { struct buffer_head *bh; int err; bh = ext4_read_block_bitmap_nowait(sb, block_group, false); if (IS_ERR(bh)) return bh; err = ext4_wait_block_bitmap(sb, block_group, bh); if (err) { put_bh(bh); return ERR_PTR(err); } return bh; } /** * ext4_has_free_clusters() * @sbi: in-core super block structure. * @nclusters: number of needed blocks * @flags: flags from ext4_mb_new_blocks() * * Check if filesystem has nclusters free & available for allocation. * On success return 1, return 0 on failure. */ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { s64 free_clusters, dirty_clusters, rsv, resv_clusters; struct percpu_counter *fcc = &sbi->s_freeclusters_counter; struct percpu_counter *dcc = &sbi->s_dirtyclusters_counter; free_clusters = percpu_counter_read_positive(fcc); dirty_clusters = percpu_counter_read_positive(dcc); resv_clusters = atomic64_read(&sbi->s_resv_clusters); /* * r_blocks_count should always be multiple of the cluster ratio so * we are safe to do a plane bit shift only. */ rsv = (ext4_r_blocks_count(sbi->s_es) >> sbi->s_cluster_bits) + resv_clusters; if (free_clusters - (nclusters + rsv + dirty_clusters) < EXT4_FREECLUSTERS_WATERMARK) { free_clusters = percpu_counter_sum_positive(fcc); dirty_clusters = percpu_counter_sum_positive(dcc); } /* Check whether we have space after accounting for current * dirty clusters & root reserved clusters. */ if (free_clusters >= (rsv + nclusters + dirty_clusters)) return 1; /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || capable(CAP_SYS_RESOURCE) || (flags & EXT4_MB_USE_ROOT_BLOCKS)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) return 1; } /* No free blocks. Let's see if we can dip into reserved pool */ if (flags & EXT4_MB_USE_RESERVED) { if (free_clusters >= (nclusters + dirty_clusters)) return 1; } return 0; } int ext4_claim_free_clusters(struct ext4_sb_info *sbi, s64 nclusters, unsigned int flags) { if (ext4_has_free_clusters(sbi, nclusters, flags)) { percpu_counter_add(&sbi->s_dirtyclusters_counter, nclusters); return 0; } else return -ENOSPC; } /** * ext4_should_retry_alloc() - check if a block allocation should be retried * @sb: superblock * @retries: number of retry attempts made so far * * ext4_should_retry_alloc() is called when ENOSPC is returned while * attempting to allocate blocks. If there's an indication that a pending * journal transaction might free some space and allow another attempt to * succeed, this function will wait for the current or committing transaction * to complete and then return TRUE. */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { struct ext4_sb_info *sbi = EXT4_SB(sb); if (!sbi->s_journal) return 0; if (++(*retries) > 3) { percpu_counter_inc(&sbi->s_sra_exceeded_retry_limit); return 0; } /* * if there's no indication that blocks are about to be freed it's * possible we just missed a transaction commit that did so */ smp_mb(); if (sbi->s_mb_free_pending == 0) return ext4_has_free_clusters(sbi, 1, 0); /* * it's possible we've just missed a transaction commit here, * so ignore the returned status */ jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); (void) jbd2_journal_force_commit_nested(sbi->s_journal); return 1; } /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) * @count: pointer to total number of clusters needed * @errp: error code * * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned int flags, unsigned long *count, int *errp) { struct ext4_allocation_request ar; ext4_fsblk_t ret; memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ ar.inode = inode; ar.goal = goal; ar.len = count ? *count : 1; ar.flags = flags; ret = ext4_mb_new_blocks(handle, &ar, errp); if (count) *count = ar.len; /* * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { dquot_alloc_block_nofail(inode, EXT4_C2B(EXT4_SB(inode->i_sb), ar.len)); } return ret; } /** * ext4_count_free_clusters() -- count filesystem free clusters * @sb: superblock * * Adds up the number of free clusters from each block group. */ ext4_fsblk_t ext4_count_free_clusters(struct super_block *sb) { ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_info *grp; #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; desc_count = 0; bitmap_count = 0; gdp = NULL; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, i); if (IS_ERR(bitmap_bh)) { bitmap_bh = NULL; continue; } x = ext4_count_free(bitmap_bh->b_data, EXT4_CLUSTERS_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %u: stored = %d, counted = %u\n", i, ext4_free_group_clusters(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); printk(KERN_DEBUG "ext4_count_free_clusters: stored = %llu" ", computed = %llu, %llu\n", EXT4_NUM_B2C(EXT4_SB(sb), ext4_free_blocks_count(es)), desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; grp = NULL; if (EXT4_SB(sb)->s_group_info) grp = ext4_get_group_info(sb, i); if (!grp || !EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) desc_count += ext4_free_group_clusters(sb, gdp); } return desc_count; #endif } static inline int test_root(ext4_group_t a, int b) { while (1) { if (a < b) return 0; if (a == b) return 1; if ((a % b) != 0) return 0; a = a / b; } } /** * ext4_bg_has_super - number of blocks used by the superblock in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the superblock (primary or backup) * in this group. Currently this will be only 0 or 1. */ int ext4_bg_has_super(struct super_block *sb, ext4_group_t group) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; if (group == 0) return 1; if (ext4_has_feature_sparse_super2(sb)) { if (group == le32_to_cpu(es->s_backup_bgs[0]) || group == le32_to_cpu(es->s_backup_bgs[1])) return 1; return 0; } if ((group <= 1) || !ext4_has_feature_sparse_super(sb)) return 1; if (!(group & 1)) return 0; if (test_root(group, 3) || (test_root(group, 5)) || test_root(group, 7)) return 1; return 0; } static unsigned long ext4_bg_num_gdb_meta(struct super_block *sb, ext4_group_t group) { unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); ext4_group_t first = metagroup * EXT4_DESC_PER_BLOCK(sb); ext4_group_t last = first + EXT4_DESC_PER_BLOCK(sb) - 1; if (group == first || group == first + 1 || group == last) return 1; return 0; } static unsigned long ext4_bg_num_gdb_nometa(struct super_block *sb, ext4_group_t group) { if (!ext4_bg_has_super(sb, group)) return 0; if (ext4_has_feature_meta_bg(sb)) return le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); else return EXT4_SB(sb)->s_gdb_count; } /** * ext4_bg_num_gdb - number of blocks used by the group table in group * @sb: superblock for filesystem * @group: group number to check * * Return the number of blocks used by the group descriptor table * (primary or backup) in this group. In the future there may be a * different number of descriptor blocks in each group. */ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) { unsigned long first_meta_bg = le32_to_cpu(EXT4_SB(sb)->s_es->s_first_meta_bg); unsigned long metagroup = group / EXT4_DESC_PER_BLOCK(sb); if (!ext4_has_feature_meta_bg(sb) || metagroup < first_meta_bg) return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } /* * This function returns the number of file system metadata clusters at * the beginning of a block group, including the reserved gdt blocks. */ static unsigned ext4_num_base_meta_clusters(struct super_block *sb, ext4_group_t block_group) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned num; /* Check for superblock and gdt backups in this group */ num = ext4_bg_has_super(sb, block_group); if (!ext4_has_feature_meta_bg(sb) || block_group < le32_to_cpu(sbi->s_es->s_first_meta_bg) * sbi->s_desc_per_block) { if (num) { num += ext4_bg_num_gdb(sb, block_group); num += le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks); } } else { /* For META_BG_BLOCK_GROUPS */ num += ext4_bg_num_gdb(sb, block_group); } return EXT4_NUM_B2C(sbi, num); } /** * ext4_inode_to_goal_block - return a hint for block allocation * @inode: inode for block allocation * * Return the ideal location to start allocating blocks for a * newly created inode. */ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); ext4_group_t block_group; ext4_grpblk_t colour; int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb)); ext4_fsblk_t bg_start; ext4_fsblk_t last_block; block_group = ei->i_block_group; if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) { /* * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME * block groups per flexgroup, reserve the first block * group for directories and special files. Regular * files will start at the second block group. This * tends to speed up directory access and improves * fsck times. */ block_group &= ~(flex_size-1); if (S_ISREG(inode->i_mode)) block_group++; } bg_start = ext4_group_first_block_no(inode->i_sb, block_group); last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1; /* * If we are doing delayed allocation, we don't need take * colour into account. */ if (test_opt(inode->i_sb, DELALLOC)) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else colour = (task_pid_nr(current) % 16) * ((last_block - bg_start) / 16); return bg_start + colour; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_COMPAT_H #define _ASM_X86_COMPAT_H /* * Architecture specific compatibility types */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm/processor.h> #include <asm/user32.h> #include <asm/unistd.h> #include <asm-generic/compat.h> #define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" typedef u16 __compat_uid_t; typedef u16 __compat_gid_t; typedef u32 __compat_uid32_t; typedef u32 __compat_gid32_t; typedef u16 compat_mode_t; typedef u16 compat_dev_t; typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; u16 __pad1; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; compat_dev_t st_rdev; u16 __pad2; u32 st_size; u32 st_blksize; u32 st_blocks; u32 st_atime; u32 st_atime_nsec; u32 st_mtime; u32 st_mtime_nsec; u32 st_ctime; u32 st_ctime_nsec; u32 __unused4; u32 __unused5; }; struct compat_flock { short l_type; short l_whence; compat_off_t l_start; compat_off_t l_len; compat_pid_t l_pid; }; #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 /* * IA32 uses 4 byte alignment for 64 bit quantities, * so we need to pack this structure. */ struct compat_flock64 { short l_type; short l_whence; compat_loff_t l_start; compat_loff_t l_len; compat_pid_t l_pid; } __attribute__((packed)); struct compat_statfs { int f_type; int f_bsize; int f_blocks; int f_bfree; int f_bavail; int f_files; int f_ffree; compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; int f_flags; int f_spare[4]; }; #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ #define _COMPAT_NSIG 64 #define _COMPAT_NSIG_BPW 32 typedef u32 compat_sigset_word; #define COMPAT_OFF_T_MAX 0x7fffffff struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; __compat_gid32_t gid; __compat_uid32_t cuid; __compat_gid32_t cgid; unsigned short mode; unsigned short __pad1; unsigned short seq; unsigned short __pad2; compat_ulong_t unused1; compat_ulong_t unused2; }; struct compat_semid64_ds { struct compat_ipc64_perm sem_perm; compat_ulong_t sem_otime; compat_ulong_t sem_otime_high; compat_ulong_t sem_ctime; compat_ulong_t sem_ctime_high; compat_ulong_t sem_nsems; compat_ulong_t __unused3; compat_ulong_t __unused4; }; struct compat_msqid64_ds { struct compat_ipc64_perm msg_perm; compat_ulong_t msg_stime; compat_ulong_t msg_stime_high; compat_ulong_t msg_rtime; compat_ulong_t msg_rtime_high; compat_ulong_t msg_ctime; compat_ulong_t msg_ctime_high; compat_ulong_t msg_cbytes; compat_ulong_t msg_qnum; compat_ulong_t msg_qbytes; compat_pid_t msg_lspid; compat_pid_t msg_lrpid; compat_ulong_t __unused4; compat_ulong_t __unused5; }; struct compat_shmid64_ds { struct compat_ipc64_perm shm_perm; compat_size_t shm_segsz; compat_ulong_t shm_atime; compat_ulong_t shm_atime_high; compat_ulong_t shm_dtime; compat_ulong_t shm_dtime_high; compat_ulong_t shm_ctime; compat_ulong_t shm_ctime_high; compat_pid_t shm_cpid; compat_pid_t shm_lpid; compat_ulong_t shm_nattch; compat_ulong_t __unused4; compat_ulong_t __unused5; }; /* * The type of struct elf_prstatus.pr_reg in compatible core dumps. */ typedef struct user_regs_struct compat_elf_gregset_t; /* Full regset -- prstatus on x32, otherwise on ia32 */ #define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296) #define SET_PR_FPVALID(S, V, R) \ do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ while (0) #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif static inline void __user *arch_compat_alloc_user_space(long len) { compat_uptr_t sp; if (test_thread_flag(TIF_IA32)) { sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ sp = task_pt_regs(current)->sp - 128; } return (void __user *)round_down(sp - len, 16); } static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) return true; #endif return false; } static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); } #ifdef CONFIG_COMPAT static inline bool in_compat_syscall(void) { return in_32bit_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ #define compat_need_64bit_alignment_fixup in_ia32_syscall #endif struct compat_siginfo; #ifdef CONFIG_X86_X32_ABI int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); #define copy_siginfo_to_user32 copy_siginfo_to_user32 #endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */
1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 /* * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin * cleaned up code to current version of sparse and added the slicing-by-8 * algorithm to the closely similar existing slicing-by-4 algorithm. * * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! * Code was from the public domain, copyright abandoned. Code was * subsequently included in the kernel, thus was re-licensed under the * GNU GPL v2. * * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> * Same crc32 function was used in 5 other places in the kernel. * I made one version, and deleted the others. * There are various incantations of crc32(). Some use a seed of 0 or ~0. * Some xor at the end with ~0. The generic crc32() function takes * seed as an argument, and doesn't xor at the end. Then individual * users can do whatever they need. * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. * fs/jffs2 uses seed 0, doesn't xor with ~0. * fs/partitions/efi.c uses seed ~0, xor's with ~0. * * This source code is licensed under the GNU General Public License, * Version 2. See the file COPYING for more details. */ /* see: Documentation/staging/crc32.rst for a description of algorithms */ #include <linux/crc32.h> #include <linux/crc32poly.h> #include <linux/module.h> #include <linux/types.h> #include <linux/sched.h> #include "crc32defs.h" #if CRC_LE_BITS > 8 # define tole(x) ((__force u32) cpu_to_le32(x)) #else # define tole(x) (x) #endif #if CRC_BE_BITS > 8 # define tobe(x) ((__force u32) cpu_to_be32(x)) #else # define tobe(x) (x) #endif #include "crc32table.h" MODULE_AUTHOR("Matt Domsch <Matt_Domsch@dell.com>"); MODULE_DESCRIPTION("Various CRC32 calculations"); MODULE_LICENSE("GPL"); #if CRC_LE_BITS > 8 || CRC_BE_BITS > 8 /* implements slicing-by-4 or slicing-by-8 algorithm */ static inline u32 __pure crc32_body(u32 crc, unsigned char const *buf, size_t len, const u32 (*tab)[256]) { # ifdef __LITTLE_ENDIAN # define DO_CRC(x) crc = t0[(crc ^ (x)) & 255] ^ (crc >> 8) # define DO_CRC4 (t3[(q) & 255] ^ t2[(q >> 8) & 255] ^ \ t1[(q >> 16) & 255] ^ t0[(q >> 24) & 255]) # define DO_CRC8 (t7[(q) & 255] ^ t6[(q >> 8) & 255] ^ \ t5[(q >> 16) & 255] ^ t4[(q >> 24) & 255]) # else # define DO_CRC(x) crc = t0[((crc >> 24) ^ (x)) & 255] ^ (crc << 8) # define DO_CRC4 (t0[(q) & 255] ^ t1[(q >> 8) & 255] ^ \ t2[(q >> 16) & 255] ^ t3[(q >> 24) & 255]) # define DO_CRC8 (t4[(q) & 255] ^ t5[(q >> 8) & 255] ^ \ t6[(q >> 16) & 255] ^ t7[(q >> 24) & 255]) # endif const u32 *b; size_t rem_len; # ifdef CONFIG_X86 size_t i; # endif const u32 *t0=tab[0], *t1=tab[1], *t2=tab[2], *t3=tab[3]; # if CRC_LE_BITS != 32 const u32 *t4 = tab[4], *t5 = tab[5], *t6 = tab[6], *t7 = tab[7]; # endif u32 q; /* Align it */ if (unlikely((long)buf & 3 && len)) { do { DO_CRC(*buf++); } while ((--len) && ((long)buf)&3); } # if CRC_LE_BITS == 32 rem_len = len & 3; len = len >> 2; # else rem_len = len & 7; len = len >> 3; # endif b = (const u32 *)buf; # ifdef CONFIG_X86 --b; for (i = 0; i < len; i++) { # else for (--b; len; --len) { # endif q = crc ^ *++b; /* use pre increment for speed */ # if CRC_LE_BITS == 32 crc = DO_CRC4; # else crc = DO_CRC8; q = *++b; crc ^= DO_CRC4; # endif } len = rem_len; /* And the last few bytes */ if (len) { u8 *p = (u8 *)(b + 1) - 1; # ifdef CONFIG_X86 for (i = 0; i < len; i++) DO_CRC(*++p); /* use pre increment for speed */ # else do { DO_CRC(*++p); /* use pre increment for speed */ } while (--len); # endif } return crc; #undef DO_CRC #undef DO_CRC4 #undef DO_CRC8 } #endif /** * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II * CRC32/CRC32C * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other * uses, or the previous crc32/crc32c value if computing incrementally. * @p: pointer to buffer over which CRC32/CRC32C is run * @len: length of buffer @p * @tab: little-endian Ethernet table * @polynomial: CRC32/CRC32c LE polynomial */ static inline u32 __pure crc32_le_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], u32 polynomial) { #if CRC_LE_BITS == 1 int i; while (len--) { crc ^= *p++; for (i = 0; i < 8; i++) crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0); } # elif CRC_LE_BITS == 2 while (len--) { crc ^= *p++; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; crc = (crc >> 2) ^ tab[0][crc & 3]; } # elif CRC_LE_BITS == 4 while (len--) { crc ^= *p++; crc = (crc >> 4) ^ tab[0][crc & 15]; crc = (crc >> 4) ^ tab[0][crc & 15]; } # elif CRC_LE_BITS == 8 /* aka Sarwate algorithm */ while (len--) { crc ^= *p++; crc = (crc >> 8) ^ tab[0][crc & 255]; } # else crc = (__force u32) __cpu_to_le32(crc); crc = crc32_body(crc, p, len, tab); crc = __le32_to_cpu((__force __le32)crc); #endif return crc; } #if CRC_LE_BITS == 1 u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32_POLY_LE); } u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, NULL, CRC32C_POLY_LE); } #else u32 __pure __weak crc32_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, (const u32 (*)[256])crc32table_le, CRC32_POLY_LE); } u32 __pure __weak __crc32c_le(u32 crc, unsigned char const *p, size_t len) { return crc32_le_generic(crc, p, len, (const u32 (*)[256])crc32ctable_le, CRC32C_POLY_LE); } #endif EXPORT_SYMBOL(crc32_le); EXPORT_SYMBOL(__crc32c_le); u32 __pure crc32_le_base(u32, unsigned char const *, size_t) __alias(crc32_le); u32 __pure __crc32c_le_base(u32, unsigned char const *, size_t) __alias(__crc32c_le); /* * This multiplies the polynomials x and y modulo the given modulus. * This follows the "little-endian" CRC convention that the lsbit * represents the highest power of x, and the msbit represents x^0. */ static u32 __attribute_const__ gf2_multiply(u32 x, u32 y, u32 modulus) { u32 product = x & 1 ? y : 0; int i; for (i = 0; i < 31; i++) { product = (product >> 1) ^ (product & 1 ? modulus : 0); x >>= 1; product ^= x & 1 ? y : 0; } return product; } /** * crc32_generic_shift - Append @len 0 bytes to crc, in logarithmic time * @crc: The original little-endian CRC (i.e. lsbit is x^31 coefficient) * @len: The number of bytes. @crc is multiplied by x^(8*@len) * @polynomial: The modulus used to reduce the result to 32 bits. * * It's possible to parallelize CRC computations by computing a CRC * over separate ranges of a buffer, then summing them. * This shifts the given CRC by 8*len bits (i.e. produces the same effect * as appending len bytes of zero to the data), in time proportional * to log(len). */ static u32 __attribute_const__ crc32_generic_shift(u32 crc, size_t len, u32 polynomial) { u32 power = polynomial; /* CRC of x^32 */ int i; /* Shift up to 32 bits in the simple linear way */ for (i = 0; i < 8 * (int)(len & 3); i++) crc = (crc >> 1) ^ (crc & 1 ? polynomial : 0); len >>= 2; if (!len) return crc; for (;;) { /* "power" is x^(2^i), modulo the polynomial */ if (len & 1) crc = gf2_multiply(crc, power, polynomial); len >>= 1; if (!len) break; /* Square power, advancing to x^(2^(i+1)) */ power = gf2_multiply(power, power, polynomial); } return crc; } u32 __attribute_const__ crc32_le_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32_POLY_LE); } u32 __attribute_const__ __crc32c_le_shift(u32 crc, size_t len) { return crc32_generic_shift(crc, len, CRC32C_POLY_LE); } EXPORT_SYMBOL(crc32_le_shift); EXPORT_SYMBOL(__crc32c_le_shift); /** * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for * other uses, or the previous crc32 value if computing incrementally. * @p: pointer to buffer over which CRC32 is run * @len: length of buffer @p * @tab: big-endian Ethernet table * @polynomial: CRC32 BE polynomial */ static inline u32 __pure crc32_be_generic(u32 crc, unsigned char const *p, size_t len, const u32 (*tab)[256], u32 polynomial) { #if CRC_BE_BITS == 1 int i; while (len--) { crc ^= *p++ << 24; for (i = 0; i < 8; i++) crc = (crc << 1) ^ ((crc & 0x80000000) ? polynomial : 0); } # elif CRC_BE_BITS == 2 while (len--) { crc ^= *p++ << 24; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; crc = (crc << 2) ^ tab[0][crc >> 30]; } # elif CRC_BE_BITS == 4 while (len--) { crc ^= *p++ << 24; crc = (crc << 4) ^ tab[0][crc >> 28]; crc = (crc << 4) ^ tab[0][crc >> 28]; } # elif CRC_BE_BITS == 8 while (len--) { crc ^= *p++ << 24; crc = (crc << 8) ^ tab[0][crc >> 24]; } # else crc = (__force u32) __cpu_to_be32(crc); crc = crc32_body(crc, p, len, tab); crc = __be32_to_cpu((__force __be32)crc); # endif return crc; } #if CRC_BE_BITS == 1 u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, NULL, CRC32_POLY_BE); } #else u32 __pure crc32_be(u32 crc, unsigned char const *p, size_t len) { return crc32_be_generic(crc, p, len, (const u32 (*)[256])crc32table_be, CRC32_POLY_BE); } #endif EXPORT_SYMBOL(crc32_be);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_H #define _LINUX_SCHED_TASK_H /* * Interface between the scheduler and various task lifetime (fork()/exit()) * functionality: */ #include <linux/sched.h> #include <linux/uaccess.h> struct task_struct; struct rusage; union thread_union; struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; struct cgroup *cgrp; struct css_set *cset; }; /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock). */ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; #ifdef CONFIG_PROVE_RCU extern int lockdep_tasklist_lock_is_held(void); #endif /* #ifdef CONFIG_PROVE_RCU */ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *, unsigned long); extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(struct task_struct *tsk); #else static inline void exit_thread(struct task_struct *tsk) { } #endif extern void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); #else #define sched_exec() {} #endif static inline struct task_struct *get_task_struct(struct task_struct *t) { refcount_inc(&t->usage); return t; } extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { if (refcount_dec_and_test(&t->usage)) __put_task_struct(t); } static inline void put_task_struct_many(struct task_struct *t, int nr) { if (refcount_sub_and_test(nr, &t->usage)) __put_task_struct(t); } void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else # define arch_task_struct_size (sizeof(struct task_struct)) #endif #ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST /* * If an architecture has not declared a thread_struct whitelist we * must assume something there may need to be copied to userspace. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; /* Handle dynamically sized thread_struct. */ *size = arch_task_struct_size - offsetof(struct task_struct, thread); } #endif #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return t->stack_vm_area; } #else static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return NULL; } #endif /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), * neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { spin_lock(&p->alloc_lock); } static inline void task_unlock(struct task_struct *p) { spin_unlock(&p->alloc_lock); } #endif /* _LINUX_SCHED_TASK_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Credentials management - see Documentation/security/credentials.rst * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_CRED_H #define _LINUX_CRED_H #include <linux/capability.h> #include <linux/init.h> #include <linux/key.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/sched.h> #include <linux/sched/user.h> struct cred; struct inode; /* * COW Supplementary groups list */ struct group_info { atomic_t usage; int ngroups; kgid_t gid[0]; } __randomize_layout; /** * get_group_info - Get a reference to a group info structure * @group_info: The group info to reference * * This gets a reference to a set of supplementary groups. * * If the caller is accessing a task's credentials, they must hold the RCU read * lock when reading. */ static inline struct group_info *get_group_info(struct group_info *gi) { atomic_inc(&gi->usage); return gi; } /** * put_group_info - Release a reference to a group info structure * @group_info: The group info to release */ #define put_group_info(group_info) \ do { \ if (atomic_dec_and_test(&(group_info)->usage)) \ groups_free(group_info); \ } while (0) extern struct group_info init_groups; #ifdef CONFIG_MULTIUSER extern struct group_info *groups_alloc(int); extern void groups_free(struct group_info *); extern int in_group_p(kgid_t); extern int in_egroup_p(kgid_t); extern int groups_search(const struct group_info *, kgid_t); extern int set_current_groups(struct group_info *); extern void set_groups(struct cred *, struct group_info *); extern bool may_setgroups(void); extern void groups_sort(struct group_info *); #else static inline void groups_free(struct group_info *group_info) { } static inline int in_group_p(kgid_t grp) { return 1; } static inline int in_egroup_p(kgid_t grp) { return 1; } static inline int groups_search(const struct group_info *group_info, kgid_t grp) { return 1; } #endif /* * The security context of a task * * The parts of the context break down into two categories: * * (1) The objective context of a task. These parts are used when some other * task is attempting to affect this one. * * (2) The subjective context. These details are used when the task is acting * upon another object, be that a file, a task, a key or whatever. * * Note that some members of this structure belong to both categories - the * LSM security pointer for instance. * * A task has two security pointers. task->real_cred points to the objective * context that defines that task's actual details. The objective part of this * context is used whenever that task is acted upon. * * task->cred points to the subjective context that defines the details of how * that task is going to act upon another object. This may be overridden * temporarily to point to another security context, but normally points to the * same context as task->real_cred. */ struct cred { atomic_t usage; #ifdef CONFIG_DEBUG_CREDENTIALS atomic_t subscribers; /* number of processes subscribed */ void *put_addr; unsigned magic; #define CRED_MAGIC 0x43736564 #define CRED_MAGIC_DEAD 0x44656144 #endif kuid_t uid; /* real UID of the task */ kgid_t gid; /* real GID of the task */ kuid_t suid; /* saved UID of the task */ kgid_t sgid; /* saved GID of the task */ kuid_t euid; /* effective UID of the task */ kgid_t egid; /* effective GID of the task */ kuid_t fsuid; /* UID for VFS ops */ kgid_t fsgid; /* GID for VFS ops */ unsigned securebits; /* SUID-less security management */ kernel_cap_t cap_inheritable; /* caps our children can inherit */ kernel_cap_t cap_permitted; /* caps we're permitted */ kernel_cap_t cap_effective; /* caps we can actually use */ kernel_cap_t cap_bset; /* capability bounding set */ kernel_cap_t cap_ambient; /* Ambient capability set */ #ifdef CONFIG_KEYS unsigned char jit_keyring; /* default keyring to attach requested * keys to */ struct key *session_keyring; /* keyring inherited over fork */ struct key *process_keyring; /* keyring private to this process */ struct key *thread_keyring; /* keyring private to this thread */ struct key *request_key_auth; /* assumed request_key authority */ #endif #ifdef CONFIG_SECURITY void *security; /* subjective LSM security */ #endif struct user_struct *user; /* real user ID subscription */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct group_info *group_info; /* supplementary groups for euid/fsgid */ /* RCU deletion */ union { int non_rcu; /* Can we skip RCU deletion? */ struct rcu_head rcu; /* RCU deletion hook */ }; } __randomize_layout; extern void __put_cred(struct cred *); extern void exit_creds(struct task_struct *); extern int copy_creds(struct task_struct *, unsigned long); extern const struct cred *get_task_cred(struct task_struct *); extern struct cred *cred_alloc_blank(void); extern struct cred *prepare_creds(void); extern struct cred *prepare_exec_creds(void); extern int commit_creds(struct cred *); extern void abort_creds(struct cred *); extern const struct cred *override_creds(const struct cred *); extern void revert_creds(const struct cred *); extern struct cred *prepare_kernel_cred(struct task_struct *); extern int change_create_files_as(struct cred *, struct inode *); extern int set_security_override(struct cred *, u32); extern int set_security_override_from_ctx(struct cred *, const char *); extern int set_create_files_as(struct cred *, struct inode *); extern int cred_fscmp(const struct cred *, const struct cred *); extern void __init cred_init(void); /* * check for validity of credentials */ #ifdef CONFIG_DEBUG_CREDENTIALS extern void __invalid_creds(const struct cred *, const char *, unsigned); extern void __validate_process_creds(struct task_struct *, const char *, unsigned); extern bool creds_are_invalid(const struct cred *cred); static inline void __validate_creds(const struct cred *cred, const char *file, unsigned line) { if (unlikely(creds_are_invalid(cred))) __invalid_creds(cred, file, line); } #define validate_creds(cred) \ do { \ __validate_creds((cred), __FILE__, __LINE__); \ } while(0) #define validate_process_creds() \ do { \ __validate_process_creds(current, __FILE__, __LINE__); \ } while(0) extern void validate_creds_for_do_exit(struct task_struct *); #else static inline void validate_creds(const struct cred *cred) { } static inline void validate_creds_for_do_exit(struct task_struct *tsk) { } static inline void validate_process_creds(void) { } #endif static inline bool cap_ambient_invariant_ok(const struct cred *cred) { return cap_issubset(cred->cap_ambient, cap_intersect(cred->cap_permitted, cred->cap_inheritable)); } /** * get_new_cred - Get a reference on a new set of credentials * @cred: The new credentials to reference * * Get a reference on the specified set of new credentials. The caller must * release the reference. */ static inline struct cred *get_new_cred(struct cred *cred) { atomic_inc(&cred->usage); return cred; } /** * get_cred - Get a reference on a set of credentials * @cred: The credentials to reference * * Get a reference on the specified set of credentials. The caller must * release the reference. If %NULL is passed, it is returned with no action. * * This is used to deal with a committed set of credentials. Although the * pointer is const, this will temporarily discard the const and increment the * usage count. The purpose of this is to attempt to catch at compile time the * accidental alteration of a set of credentials that should be considered * immutable. */ static inline const struct cred *get_cred(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return cred; validate_creds(cred); nonconst_cred->non_rcu = 0; return get_new_cred(nonconst_cred); } static inline const struct cred *get_cred_rcu(const struct cred *cred) { struct cred *nonconst_cred = (struct cred *) cred; if (!cred) return NULL; if (!atomic_inc_not_zero(&nonconst_cred->usage)) return NULL; validate_creds(cred); nonconst_cred->non_rcu = 0; return cred; } /** * put_cred - Release a reference to a set of credentials * @cred: The credentials to release * * Release a reference to a set of credentials, deleting them when the last ref * is released. If %NULL is passed, nothing is done. * * This takes a const pointer to a set of credentials because the credentials * on task_struct are attached by const pointers to prevent accidental * alteration of otherwise immutable credential sets. */ static inline void put_cred(const struct cred *_cred) { struct cred *cred = (struct cred *) _cred; if (cred) { validate_creds(cred); if (atomic_dec_and_test(&(cred)->usage)) __put_cred(cred); } } /** * current_cred - Access the current task's subjective credentials * * Access the subjective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_cred() \ rcu_dereference_protected(current->cred, 1) /** * current_real_cred - Access the current task's objective credentials * * Access the objective credentials of the current task. RCU-safe, * since nobody else can modify it. */ #define current_real_cred() \ rcu_dereference_protected(current->real_cred, 1) /** * __task_cred - Access a task's objective credentials * @task: The task to query * * Access the objective credentials of a task. The caller must hold the RCU * readlock. * * The result of this function should not be passed directly to get_cred(); * rather get_task_cred() should be used instead. */ #define __task_cred(task) \ rcu_dereference((task)->real_cred) /** * get_current_cred - Get the current task's subjective credentials * * Get the subjective credentials of the current task, pinning them so that * they can't go away. Accessing the current task's credentials directly is * not permitted. */ #define get_current_cred() \ (get_cred(current_cred())) /** * get_current_user - Get the current task's user_struct * * Get the user record of the current task, pinning it so that it can't go * away. */ #define get_current_user() \ ({ \ struct user_struct *__u; \ const struct cred *__cred; \ __cred = current_cred(); \ __u = get_uid(__cred->user); \ __u; \ }) /** * get_current_groups - Get the current task's supplementary group list * * Get the supplementary group list of the current task, pinning it so that it * can't go away. */ #define get_current_groups() \ ({ \ struct group_info *__groups; \ const struct cred *__cred; \ __cred = current_cred(); \ __groups = get_group_info(__cred->group_info); \ __groups; \ }) #define task_cred_xxx(task, xxx) \ ({ \ __typeof__(((struct cred *)NULL)->xxx) ___val; \ rcu_read_lock(); \ ___val = __task_cred((task))->xxx; \ rcu_read_unlock(); \ ___val; \ }) #define task_uid(task) (task_cred_xxx((task), uid)) #define task_euid(task) (task_cred_xxx((task), euid)) #define current_cred_xxx(xxx) \ ({ \ current_cred()->xxx; \ }) #define current_uid() (current_cred_xxx(uid)) #define current_gid() (current_cred_xxx(gid)) #define current_euid() (current_cred_xxx(euid)) #define current_egid() (current_cred_xxx(egid)) #define current_suid() (current_cred_xxx(suid)) #define current_sgid() (current_cred_xxx(sgid)) #define current_fsuid() (current_cred_xxx(fsuid)) #define current_fsgid() (current_cred_xxx(fsgid)) #define current_cap() (current_cred_xxx(cap_effective)) #define current_user() (current_cred_xxx(user)) extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS #define current_user_ns() (current_cred_xxx(user_ns)) #else static inline struct user_namespace *current_user_ns(void) { return &init_user_ns; } #endif #define current_uid_gid(_uid, _gid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_uid) = __cred->uid; \ *(_gid) = __cred->gid; \ } while(0) #define current_euid_egid(_euid, _egid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_euid) = __cred->euid; \ *(_egid) = __cred->egid; \ } while(0) #define current_fsuid_fsgid(_fsuid, _fsgid) \ do { \ const struct cred *__cred; \ __cred = current_cred(); \ *(_fsuid) = __cred->fsuid; \ *(_fsgid) = __cred->fsgid; \ } while(0) #endif /* _LINUX_CRED_H */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 // SPDX-License-Identifier: GPL-2.0 /* File: fs/ext4/xattr.h On-disk format of extended attributes for the ext4 filesystem. (C) 2001 Andreas Gruenbacher, <a.gruenbacher@computer.org> */ #include <linux/xattr.h> /* Magic value in attribute blocks */ #define EXT4_XATTR_MAGIC 0xEA020000 /* Maximum number of references to one attribute block */ #define EXT4_XATTR_REFCOUNT_MAX 1024 /* Name indexes */ #define EXT4_XATTR_INDEX_USER 1 #define EXT4_XATTR_INDEX_POSIX_ACL_ACCESS 2 #define EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT 3 #define EXT4_XATTR_INDEX_TRUSTED 4 #define EXT4_XATTR_INDEX_LUSTRE 5 #define EXT4_XATTR_INDEX_SECURITY 6 #define EXT4_XATTR_INDEX_SYSTEM 7 #define EXT4_XATTR_INDEX_RICHACL 8 #define EXT4_XATTR_INDEX_ENCRYPTION 9 #define EXT4_XATTR_INDEX_HURD 10 /* Reserved for Hurd */ struct ext4_xattr_header { __le32 h_magic; /* magic number for identification */ __le32 h_refcount; /* reference count */ __le32 h_blocks; /* number of disk blocks used */ __le32 h_hash; /* hash value of all attributes */ __le32 h_checksum; /* crc32c(uuid+id+xattrblock) */ /* id = inum if refcount=1, blknum otherwise */ __u32 h_reserved[3]; /* zero right now */ }; struct ext4_xattr_ibody_header { __le32 h_magic; /* magic number for identification */ }; struct ext4_xattr_entry { __u8 e_name_len; /* length of name */ __u8 e_name_index; /* attribute name index */ __le16 e_value_offs; /* offset in disk block of value */ __le32 e_value_inum; /* inode in which the value is stored */ __le32 e_value_size; /* size of attribute value */ __le32 e_hash; /* hash value of name and value */ char e_name[]; /* attribute name */ }; #define EXT4_XATTR_PAD_BITS 2 #define EXT4_XATTR_PAD (1<<EXT4_XATTR_PAD_BITS) #define EXT4_XATTR_ROUND (EXT4_XATTR_PAD-1) #define EXT4_XATTR_LEN(name_len) \ (((name_len) + EXT4_XATTR_ROUND + \ sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) #define EXT4_XATTR_NEXT(entry) \ ((struct ext4_xattr_entry *)( \ (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len))) #define EXT4_XATTR_SIZE(size) \ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) #define IHDR(inode, raw_inode) \ ((struct ext4_xattr_ibody_header *) \ ((void *)raw_inode + \ EXT4_GOOD_OLD_INODE_SIZE + \ EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) /* * XATTR_SIZE_MAX is currently 64k, but for the purposes of checking * for file system consistency errors, we use a somewhat bigger value. * This allows XATTR_SIZE_MAX to grow in the future, but by using this * instead of INT_MAX for certain consistency checks, we don't need to * worry about arithmetic overflows. (Actually XATTR_SIZE_MAX is * defined in include/uapi/linux/limits.h, so changing it is going * not going to be trivial....) */ #define EXT4_XATTR_SIZE_MAX (1 << 24) /* * The minimum size of EA value when you start storing it in an external inode * size of block - size of header - size of 1 entry - 4 null bytes */ #define EXT4_XATTR_MIN_LARGE_EA_SIZE(b) \ ((b) - EXT4_XATTR_LEN(3) - sizeof(struct ext4_xattr_header) - 4) #define BHDR(bh) ((struct ext4_xattr_header *)((bh)->b_data)) #define ENTRY(ptr) ((struct ext4_xattr_entry *)(ptr)) #define BFIRST(bh) ENTRY(BHDR(bh)+1) #define IS_LAST_ENTRY(entry) (*(__u32 *)(entry) == 0) #define EXT4_ZERO_XATTR_VALUE ((void *)-1) struct ext4_xattr_info { const char *name; const void *value; size_t value_len; int name_index; int in_inode; }; struct ext4_xattr_search { struct ext4_xattr_entry *first; void *base; void *end; struct ext4_xattr_entry *here; int not_found; }; struct ext4_xattr_ibody_find { struct ext4_xattr_search s; struct ext4_iloc iloc; }; struct ext4_xattr_inode_array { unsigned int count; /* # of used items in the array */ struct inode *inodes[]; }; extern const struct xattr_handler ext4_xattr_user_handler; extern const struct xattr_handler ext4_xattr_trusted_handler; extern const struct xattr_handler ext4_xattr_security_handler; extern const struct xattr_handler ext4_xattr_hurd_handler; #define EXT4_XATTR_NAME_ENCRYPTION_CONTEXT "c" /* * The EXT4_STATE_NO_EXPAND is overloaded and used for two purposes. * The first is to signal that there the inline xattrs and data are * taking up so much space that we might as well not keep trying to * expand it. The second is that xattr_sem is taken for writing, so * we shouldn't try to recurse into the inode expansion. For this * second case, we need to make sure that we take save and restore the * NO_EXPAND state flag appropriately. */ static inline void ext4_write_lock_xattr(struct inode *inode, int *save) { down_write(&EXT4_I(inode)->xattr_sem); *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); } static inline int ext4_write_trylock_xattr(struct inode *inode, int *save) { if (down_write_trylock(&EXT4_I(inode)->xattr_sem) == 0) return 0; *save = ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND); ext4_set_inode_state(inode, EXT4_STATE_NO_EXPAND); return 1; } static inline void ext4_write_unlock_xattr(struct inode *inode, int *save) { if (*save == 0) ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); up_write(&EXT4_I(inode)->xattr_sem); } extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_credits(struct inode *inode, size_t value_len, bool is_create, int *credits); extern int __ext4_xattr_set_credits(struct super_block *sb, struct inode *inode, struct buffer_head *block_bh, size_t value_len, bool is_create); extern int ext4_xattr_delete_inode(handle_t *handle, struct inode *inode, struct ext4_xattr_inode_array **array, int extra_credits); extern void ext4_xattr_inode_array_free(struct ext4_xattr_inode_array *array); extern int ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, struct ext4_inode *raw_inode, handle_t *handle); extern const struct xattr_handler *ext4_xattr_handlers[]; extern int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); extern int ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, void *buffer, size_t buffer_size); extern int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is); extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); #else static inline int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr) { return 0; } #endif #ifdef CONFIG_LOCKDEP extern void ext4_xattr_inode_set_class(struct inode *ea_inode); #else static inline void ext4_xattr_inode_set_class(struct inode *ea_inode) { } #endif extern int ext4_get_inode_usage(struct inode *inode, qsize_t *usage);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Private definitions for the generic associative array implementation. * * See Documentation/core-api/assoc_array.rst for information. * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_ASSOC_ARRAY_PRIV_H #define _LINUX_ASSOC_ARRAY_PRIV_H #ifdef CONFIG_ASSOCIATIVE_ARRAY #include <linux/assoc_array.h> #define ASSOC_ARRAY_FAN_OUT 16 /* Number of slots per node */ #define ASSOC_ARRAY_FAN_MASK (ASSOC_ARRAY_FAN_OUT - 1) #define ASSOC_ARRAY_LEVEL_STEP (ilog2(ASSOC_ARRAY_FAN_OUT)) #define ASSOC_ARRAY_LEVEL_STEP_MASK (ASSOC_ARRAY_LEVEL_STEP - 1) #define ASSOC_ARRAY_KEY_CHUNK_MASK (ASSOC_ARRAY_KEY_CHUNK_SIZE - 1) #define ASSOC_ARRAY_KEY_CHUNK_SHIFT (ilog2(BITS_PER_LONG)) /* * Undefined type representing a pointer with type information in the bottom * two bits. */ struct assoc_array_ptr; /* * An N-way node in the tree. * * Each slot contains one of four things: * * (1) Nothing (NULL). * * (2) A leaf object (pointer types 0). * * (3) A next-level node (pointer type 1, subtype 0). * * (4) A shortcut (pointer type 1, subtype 1). * * The tree is optimised for search-by-ID, but permits reasonable iteration * also. * * The tree is navigated by constructing an index key consisting of an array of * segments, where each segment is ilog2(ASSOC_ARRAY_FAN_OUT) bits in size. * * The segments correspond to levels of the tree (the first segment is used at * level 0, the second at level 1, etc.). */ struct assoc_array_node { struct assoc_array_ptr *back_pointer; u8 parent_slot; struct assoc_array_ptr *slots[ASSOC_ARRAY_FAN_OUT]; unsigned long nr_leaves_on_branch; }; /* * A shortcut through the index space out to where a collection of nodes/leaves * with the same IDs live. */ struct assoc_array_shortcut { struct assoc_array_ptr *back_pointer; int parent_slot; int skip_to_level; struct assoc_array_ptr *next_node; unsigned long index_key[]; }; /* * Preallocation cache. */ struct assoc_array_edit { struct rcu_head rcu; struct assoc_array *array; const struct assoc_array_ops *ops; const struct assoc_array_ops *ops_for_excised_subtree; struct assoc_array_ptr *leaf; struct assoc_array_ptr **leaf_p; struct assoc_array_ptr *dead_leaf; struct assoc_array_ptr *new_meta[3]; struct assoc_array_ptr *excised_meta[1]; struct assoc_array_ptr *excised_subtree; struct assoc_array_ptr **set_backpointers[ASSOC_ARRAY_FAN_OUT]; struct assoc_array_ptr *set_backpointers_to; struct assoc_array_node *adjust_count_on; long adjust_count_by; struct { struct assoc_array_ptr **ptr; struct assoc_array_ptr *to; } set[2]; struct { u8 *p; u8 to; } set_parent_slot[1]; u8 segment_cache[ASSOC_ARRAY_FAN_OUT + 1]; }; /* * Internal tree member pointers are marked in the bottom one or two bits to * indicate what type they are so that we don't have to look behind every * pointer to see what it points to. * * We provide functions to test type annotations and to create and translate * the annotated pointers. */ #define ASSOC_ARRAY_PTR_TYPE_MASK 0x1UL #define ASSOC_ARRAY_PTR_LEAF_TYPE 0x0UL /* Points to leaf (or nowhere) */ #define ASSOC_ARRAY_PTR_META_TYPE 0x1UL /* Points to node or shortcut */ #define ASSOC_ARRAY_PTR_SUBTYPE_MASK 0x2UL #define ASSOC_ARRAY_PTR_NODE_SUBTYPE 0x0UL #define ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE 0x2UL static inline bool assoc_array_ptr_is_meta(const struct assoc_array_ptr *x) { return (unsigned long)x & ASSOC_ARRAY_PTR_TYPE_MASK; } static inline bool assoc_array_ptr_is_leaf(const struct assoc_array_ptr *x) { return !assoc_array_ptr_is_meta(x); } static inline bool assoc_array_ptr_is_shortcut(const struct assoc_array_ptr *x) { return (unsigned long)x & ASSOC_ARRAY_PTR_SUBTYPE_MASK; } static inline bool assoc_array_ptr_is_node(const struct assoc_array_ptr *x) { return !assoc_array_ptr_is_shortcut(x); } static inline void *assoc_array_ptr_to_leaf(const struct assoc_array_ptr *x) { return (void *)((unsigned long)x & ~ASSOC_ARRAY_PTR_TYPE_MASK); } static inline unsigned long __assoc_array_ptr_to_meta(const struct assoc_array_ptr *x) { return (unsigned long)x & ~(ASSOC_ARRAY_PTR_SUBTYPE_MASK | ASSOC_ARRAY_PTR_TYPE_MASK); } static inline struct assoc_array_node *assoc_array_ptr_to_node(const struct assoc_array_ptr *x) { return (struct assoc_array_node *)__assoc_array_ptr_to_meta(x); } static inline struct assoc_array_shortcut *assoc_array_ptr_to_shortcut(const struct assoc_array_ptr *x) { return (struct assoc_array_shortcut *)__assoc_array_ptr_to_meta(x); } static inline struct assoc_array_ptr *__assoc_array_x_to_ptr(const void *p, unsigned long t) { return (struct assoc_array_ptr *)((unsigned long)p | t); } static inline struct assoc_array_ptr *assoc_array_leaf_to_ptr(const void *p) { return __assoc_array_x_to_ptr(p, ASSOC_ARRAY_PTR_LEAF_TYPE); } static inline struct assoc_array_ptr *assoc_array_node_to_ptr(const struct assoc_array_node *p) { return __assoc_array_x_to_ptr( p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_NODE_SUBTYPE); } static inline struct assoc_array_ptr *assoc_array_shortcut_to_ptr(const struct assoc_array_shortcut *p) { return __assoc_array_x_to_ptr( p, ASSOC_ARRAY_PTR_META_TYPE | ASSOC_ARRAY_PTR_SHORTCUT_SUBTYPE); } #endif /* CONFIG_ASSOCIATIVE_ARRAY */ #endif /* _LINUX_ASSOC_ARRAY_PRIV_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Framework and drivers for configuring and reading different PHYs * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * * Copyright (c) 2004 Freescale Semiconductor, Inc. */ #ifndef __PHY_H #define __PHY_H #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/linkmode.h> #include <linux/netlink.h> #include <linux/mdio.h> #include <linux/mii.h> #include <linux/mii_timestamper.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/mod_devicetable.h> #include <linux/u64_stats_sync.h> #include <linux/irqreturn.h> #include <linux/iopoll.h> #include <linux/refcount.h> #include <linux/atomic.h> #define PHY_DEFAULT_FEATURES (SUPPORTED_Autoneg | \ SUPPORTED_TP | \ SUPPORTED_MII) #define PHY_10BT_FEATURES (SUPPORTED_10baseT_Half | \ SUPPORTED_10baseT_Full) #define PHY_100BT_FEATURES (SUPPORTED_100baseT_Half | \ SUPPORTED_100baseT_Full) #define PHY_1000BT_FEATURES (SUPPORTED_1000baseT_Half | \ SUPPORTED_1000baseT_Full) extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init; #define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features) #define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features) #define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features) #define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features) #define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features) #define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features) #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features) #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features) extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; extern const int phy_basic_t1_features_array[2]; extern const int phy_gbit_features_array[2]; extern const int phy_10gbit_features_array[1]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, * or not desired for this PHY. Set to PHY_IGNORE_INTERRUPT if * the attached driver handles the interrupt */ #define PHY_POLL -1 #define PHY_IGNORE_INTERRUPT -2 #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 #define PHY_POLL_CABLE_TEST 0x00000004 #define MDIO_DEVICE_IS_PHY 0x80000000 /** * enum phy_interface_t - Interface Mode definitions * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined * @PHY_INTERFACE_MODE_MII: Median-independent interface * @PHY_INTERFACE_MODE_GMII: Gigabit median-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI * @PHY_INTERFACE_MODE_SMII: ??? MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, PHY_INTERFACE_MODE_MII, PHY_INTERFACE_MODE_GMII, PHY_INTERFACE_MODE_SGMII, PHY_INTERFACE_MODE_TBI, PHY_INTERFACE_MODE_REVMII, PHY_INTERFACE_MODE_RMII, PHY_INTERFACE_MODE_RGMII, PHY_INTERFACE_MODE_RGMII_ID, PHY_INTERFACE_MODE_RGMII_RXID, PHY_INTERFACE_MODE_RGMII_TXID, PHY_INTERFACE_MODE_RTBI, PHY_INTERFACE_MODE_SMII, PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_XLGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_QSGMII, PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ PHY_INTERFACE_MODE_10GBASER, PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_MAX, } phy_interface_t; /* * phy_supported_speeds - return all speeds currently supported by a PHY device */ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, unsigned int size); /** * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { switch (interface) { case PHY_INTERFACE_MODE_NA: return ""; case PHY_INTERFACE_MODE_INTERNAL: return "internal"; case PHY_INTERFACE_MODE_MII: return "mii"; case PHY_INTERFACE_MODE_GMII: return "gmii"; case PHY_INTERFACE_MODE_SGMII: return "sgmii"; case PHY_INTERFACE_MODE_TBI: return "tbi"; case PHY_INTERFACE_MODE_REVMII: return "rev-mii"; case PHY_INTERFACE_MODE_RMII: return "rmii"; case PHY_INTERFACE_MODE_RGMII: return "rgmii"; case PHY_INTERFACE_MODE_RGMII_ID: return "rgmii-id"; case PHY_INTERFACE_MODE_RGMII_RXID: return "rgmii-rxid"; case PHY_INTERFACE_MODE_RGMII_TXID: return "rgmii-txid"; case PHY_INTERFACE_MODE_RTBI: return "rtbi"; case PHY_INTERFACE_MODE_SMII: return "smii"; case PHY_INTERFACE_MODE_XGMII: return "xgmii"; case PHY_INTERFACE_MODE_XLGMII: return "xlgmii"; case PHY_INTERFACE_MODE_MOCA: return "moca"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; case PHY_INTERFACE_MODE_TRGMII: return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_RXAUI: return "rxaui"; case PHY_INTERFACE_MODE_XAUI: return "xaui"; case PHY_INTERFACE_MODE_10GBASER: return "10gbase-r"; case PHY_INTERFACE_MODE_USXGMII: return "usxgmii"; case PHY_INTERFACE_MODE_10GKR: return "10gbase-kr"; default: return "unknown"; } } #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 #define PHY_MAX_ADDR 32 /* Used when trying to connect to a specific phy (mii bus id:phy device id) */ #define PHY_ID_FMT "%s:%02x" #define MII_BUS_ID_SIZE 61 struct device; struct phylink; struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; /** * struct mdio_bus_stats - Statistics counters for MDIO busses * @transfers: Total number of transfers, i.e. @writes + @reads * @errors: Number of MDIO transfers that returned an error * @writes: Number of write transfers * @reads: Number of read transfers * @syncp: Synchronisation for incrementing statistics */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; u64_stats_t writes; u64_stats_t reads; /* Must be last, add new statistics above */ struct u64_stats_sync syncp; }; /** * struct phy_package_shared - Shared information in PHY packages * @addr: Common PHY address used to combine PHYs in one package * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv * @priv: Driver private data shared across a PHY package * * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ struct phy_package_shared { int addr; refcount_t refcnt; unsigned long flags; size_t priv_size; /* private data pointer */ /* note that this pointer is shared between different phydevs and * the user has to take care of appropriate locking. It is allocated * and freed automatically by phy_package_join() and * phy_package_leave(). */ void *priv; }; /* used as bit number in atomic bitops */ #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 /** * struct mii_bus - Represents an MDIO bus * * @owner: Who owns this device * @name: User friendly name for this MDIO device, or driver name * @id: Unique identifier for this bus, typical from bus hierarchy * @priv: Driver private data * * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure */ struct mii_bus { struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; /** * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; /** @parent: Parent device of this bus */ struct device *parent; /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; /** @dev: Kernel device representation */ struct device dev; /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; /** * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; /** @reset_gpiod: Reset GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; /** @probe_capabilities: bus capabilities, used for probing */ enum { MDIOBUS_NO_CAP = 0, MDIOBUS_C22, MDIOBUS_C45, MDIOBUS_C22_C45, } probe_capabilities; /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) struct mii_bus *mdiobus_alloc_size(size_t size); /** * mdiobus_alloc - Allocate an MDIO bus structure * * The internal state of the MDIO bus will be set of MDIOBUS_ALLOCATED ready * for the driver to register the bus. */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); } int __mdiobus_register(struct mii_bus *bus, struct module *owner); int __devm_mdiobus_register(struct device *dev, struct mii_bus *bus, struct module *owner); #define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) #define devm_mdiobus_register(dev, bus) \ __devm_mdiobus_register(dev, bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) { return devm_mdiobus_alloc_size(dev, 0); } struct mii_bus *mdio_find_bus(const char *mdio_name); struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true /** * enum phy_state - PHY state machine states: * * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. * - PHY driver probe function will set the state to @PHY_READY * * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. * - timer moves to @PHY_NOLINK or @PHY_RUNNING * * @PHY_NOLINK: PHY is up, but not currently plugged in. * - irq or timer will set @PHY_RUNNING if link comes back * - phy_stop moves to @PHY_HALTED * * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets * - irq or timer will set @PHY_NOLINK if link goes down * - phy_stop moves to @PHY_HALTED * * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. * - phy_stop aborts the running test and moves to @PHY_HALTED * * @PHY_HALTED: PHY is up, but no polling or interrupts are done. Or * PHY is in an error state. * - phy_start moves to @PHY_UP */ enum phy_state { PHY_DOWN = 0, PHY_READY, PHY_HALTED, PHY_UP, PHY_RUNNING, PHY_NOLINK, PHY_CABLETEST, }; #define MDIO_MMD_NUM 32 /** * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. * @device_ids: The device identifer for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; u32 mmds_present; u32 device_ids[MDIO_MMD_NUM]; }; struct macsec_context; struct macsec_ops; /** * struct phy_device - An instance of a PHY * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. * @is_internal: Set to true if this PHY is internal to a MAC. * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. * @is_gigabit_capable: Set to true if PHY supports 1000Mbps * @has_fixups: Set to true if this PHY has fixups/quirks. * @suspended: Set to true if this PHY has been suspended successfully. * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phy_timer: The timer for handling the state machine * @phylink: Pointer to phylink instance for this PHY * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached * @sfp_bus: SFP bus attached to this PHY's fiber port * @attached_dev: The attached enet driver's device instance ptr * @adjust_link: Callback for the enet controller to respond to changes: in the * link state. * @phy_link_change: Callback for phylink for notification of link change * @macsec_ops: MACsec offloading ops. * * @speed: Current link speed * @duplex: Current duplex * @port: Current port * @pause: Current pause * @asym_pause: Current asymmetric pause * @supported: Combined MAC/PHY supported linkmodes * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @lp_advertising: Current link partner advertised linkmodes * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover * @interrupts: Flag interrupts have been enabled * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics * @phy_led_triggers: Array of LED triggers * @phy_num_led_triggers: Number of triggers in @phy_led_triggers * @led_link_trigger: LED trigger for link up/down * @last_triggered: last LED trigger for link speed * @master_slave_set: User requested master/slave configuration * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling * and disabling specific interrupts * * Contains some infrastructure for polling and interrupt * handling, as well as handling shifts in PHY hardware state */ struct phy_device { struct mdio_device mdio; /* Information about the PHY type */ /* And management functions */ struct phy_driver *drv; u32 phy_id; struct phy_c45_device_ids c45_ids; unsigned is_c45:1; unsigned is_internal:1; unsigned is_pseudo_fixed_link:1; unsigned is_gigabit_capable:1; unsigned has_fixups:1; unsigned suspended:1; unsigned suspended_by_mdio_bus:1; unsigned sysfs_links:1; unsigned loopback_enabled:1; unsigned downshifted_rate:1; unsigned autoneg:1; /* The most recently read link state */ unsigned link:1; unsigned autoneg_complete:1; /* Interrupts are enabled */ unsigned interrupts:1; enum phy_state state; u32 dev_flags; phy_interface_t interface; /* * forced speed & duplex (no autoneg) * partner speed & duplex & pause (autoneg) */ int speed; int duplex; int port; int pause; int asym_pause; u8 master_slave_get; u8 master_slave_set; u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; struct phy_led_trigger *led_link_trigger; #endif /* * Interrupt number for this PHY * -1 means no interrupt */ int irq; /* private data pointer */ /* For use by PHYs to maintain extra state */ void *priv; /* shared data pointer */ /* For use by PHYs inside the same package that need a shared state. */ struct phy_package_shared *shared; /* Reporting cable test results */ struct sk_buff *skb; void *ehdr; struct nlattr *nest; /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; struct mutex lock; /* This may be modified under the rtnl lock */ bool sfp_bus_attached; struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; u8 mdix; u8 mdix_ctrl; void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif }; #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) /** * struct phy_tdr_config - Configuration of a TDR raw test * * @first: Distance for first data collection point * @last: Distance for last data collection point * @step: Step between data collection points * @pair: Bitmap of cable pairs to collect data for * * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. * All distances are in centimeters. */ struct phy_tdr_config { u32 first; u32 last; u32 step; s8 pair; }; #define PHY_PAIR_ALL -1 /** * struct phy_driver - Driver structure for a particular PHY type * * @mdiodrv: Data common to all MDIO devices * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field * @name: The friendly name of this PHY type * @phy_id_mask: Defines the important bits of the phy_id * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. * Note that none of these functions should be called from * interrupt time. The goal is for the bus read/write functions * to be able to block when the bus transaction is happening, * and be freed up by an interrupt (The MPC85xx has this ability, * though it is not currently supported in the driver). */ struct phy_driver { struct mdio_driver_common mdiodrv; u32 phy_id; char *name; u32 phy_id_mask; const unsigned long * const features; u32 flags; const void *driver_data; /** * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); /** * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); /** * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); /** * @get_features: Probe the hardware to determine what * abilities it has. Should only set phydev->supported. */ int (*get_features)(struct phy_device *phydev); /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); /** * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); /** @ack_interrupt: Clears any pending interrupts */ int (*ack_interrupt)(struct phy_device *phydev); /** @config_intr: Enables or disables interrupts */ int (*config_intr)(struct phy_device *phydev); /** * @did_interrupt: Checks if the PHY generated an interrupt. * For multi-PHY devices with shared PHY interrupt pin * Set interrupt bits have to be cleared. */ int (*did_interrupt)(struct phy_device *phydev); /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); /** * @match_phy_device: Returns true if this is a suitable * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY * register changes to enable Wake on LAN, so set_wol is * provided to be called in the ethernet driver's set_wol * function. */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @get_wol: See set_wol, but for checking whether Wake on LAN * is enabled. */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @link_change_notify: Called to inform a PHY device driver * when the core is about to change the link state. This * callback is supposed to be used as fixup hook for drivers * that need to take action when the link state * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); /** * @read_mmd: PHY specific driver override for reading a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD read function * will be used by phy_read_mmd(), which will use either a * direct read for Clause 45 PHYs or an indirect read for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); /** * @write_mmd: PHY specific driver override for writing a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD write function * will be used by phy_write_mmd(), which will use either a * direct write for Clause 45 PHYs, or an indirect write for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. val is the value to be written. */ int (*write_mmd)(struct phy_device *dev, int devnum, u16 regnum, u16 val); /** @read_page: Return the current PHY register page number */ int (*read_page)(struct phy_device *dev); /** @write_page: Set the current PHY register page number */ int (*write_page)(struct phy_device *dev, int page); /** * @module_info: Get the size and type of the eeprom contained * within a plug-in module */ int (*module_info)(struct phy_device *dev, struct ethtool_modinfo *modinfo); /** * @module_eeprom: Get the eeprom information from the plug-in * module */ int (*module_eeprom)(struct phy_device *dev, struct ethtool_eeprom *ee, u8 *data); /** @cable_test_start: Start a cable test */ int (*cable_test_start)(struct phy_device *dev); /** @cable_test_tdr_start: Start a raw TDR cable test */ int (*cable_test_tdr_start)(struct phy_device *dev, const struct phy_tdr_config *config); /** * @cable_test_get_status: Once per second, or on interrupt, * request the status of the test. */ int (*cable_test_get_status)(struct phy_device *dev, bool *finished); /* Get statistics from the PHY using ethtool */ /** @get_sset_count: Number of statistic counters */ int (*get_sset_count)(struct phy_device *dev); /** @get_strings: Names of the statistic counters */ void (*get_strings)(struct phy_device *dev, u8 *data); /** @get_stats: Return the statistic counter values */ void (*get_stats)(struct phy_device *dev, struct ethtool_stats *stats, u64 *data); /* Get and Set PHY tunables */ /** @get_tunable: Return the value of a tunable */ int (*get_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, void *data); /** @set_tunable: Set the value of a tunable */ int (*set_tunable)(struct phy_device *dev, struct ethtool_tunable *tuna, const void *data); /** @set_loopback: Set the loopback mood of the PHY */ int (*set_loopback)(struct phy_device *dev, bool enable); /** @get_sqi: Get the signal quality indication */ int (*get_sqi)(struct phy_device *dev); /** @get_sqi_max: Get the maximum signal quality indication */ int (*get_sqi_max)(struct phy_device *dev); }; #define to_phy_driver(d) container_of(to_mdio_common_driver(d), \ struct phy_driver, mdiodrv) #define PHY_ANY_ID "MATCH ANY PHY" #define PHY_ANY_UID 0xffffffff #define PHY_ID_MATCH_EXACT(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 0) #define PHY_ID_MATCH_MODEL(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 4) #define PHY_ID_MATCH_VENDOR(id) .phy_id = (id), .phy_id_mask = GENMASK(31, 10) /* A Structure for boards to register fixups with the PHY Lib */ struct phy_fixup { struct list_head list; char bus_id[MII_BUS_ID_SIZE + 3]; u32 phy_uid; u32 phy_uid_mask; int (*run)(struct phy_device *phydev); }; const char *phy_speed_to_str(int speed); const char *phy_duplex_to_str(unsigned int duplex); /* A structure for mapping a particular speed and duplex * combination to a particular SUPPORTED and ADVERTISED value */ struct phy_setting { u32 speed; u8 duplex; u8 bit; }; const struct phy_setting * phy_lookup_setting(int speed, int duplex, const unsigned long *mask, bool exact); size_t phy_speeds(unsigned int *speeds, size_t size, unsigned long *mask); void of_set_phy_supported(struct phy_device *phydev); void of_set_phy_eee_broken(struct phy_device *phydev); int phy_speed_down_core(struct phy_device *phydev); /** * phy_is_started - Convenience function to check whether PHY is started * @phydev: The phy_device struct */ static inline bool phy_is_started(struct phy_device *phydev) { return phydev->state >= PHY_UP; } void phy_resolve_aneg_pause(struct phy_device *phydev); void phy_resolve_aneg_linkmode(struct phy_device *phydev); void phy_check_downshift(struct phy_device *phydev); /** * phy_read - Convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_read(struct phy_device *phydev, u32 regnum) { return mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } #define phy_read_poll_timeout(phydev, regnum, val, cond, sleep_us, \ timeout_us, sleep_before_read) \ ({ \ int __ret = read_poll_timeout(phy_read, val, (cond) || val < 0, \ sleep_us, timeout_us, sleep_before_read, phydev, regnum); \ if (val < 0) \ __ret = val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /** * __phy_read - convenience function for reading a given PHY register * @phydev: the phy_device struct * @regnum: register number to read * * The caller must have taken the MDIO bus lock. */ static inline int __phy_read(struct phy_device *phydev, u32 regnum) { return __mdiobus_read(phydev->mdio.bus, phydev->mdio.addr, regnum); } /** * phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * NOTE: MUST NOT be called from interrupt context, * because the bus read/write functions may wait for an interrupt * to conclude the operation. */ static inline int phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_write - Convenience function for writing a given PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: value to write to @regnum * * The caller must have taken the MDIO bus lock. */ static inline int __phy_write(struct phy_device *phydev, u32 regnum, u16 val) { return __mdiobus_write(phydev->mdio.bus, phydev->mdio.addr, regnum, val); } /** * __phy_modify_changed() - Convenience function for modifying a PHY register * @phydev: a pointer to a &struct phy_device * @regnum: register number * @mask: bit mask of bits to clear * @set: bit mask of bits to set * * Unlocked helper function which allows a PHY register to be modified as * new register value = (old register value & ~mask) | set * * Returns negative errno, 0 if there was no change, and 1 in case of change */ static inline int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set) { return __mdiobus_modify_changed(phydev->mdio.bus, phydev->mdio.addr, regnum, mask, set); } /* * phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /** * phy_read_mmd_poll_timeout - Periodically poll a PHY register until a * condition is met or a timeout occurs * * @phydev: The phy_device struct * @devaddr: The MMD to read from * @regnum: The register on the MMD to read * @val: Variable to read the register into * @cond: Break condition (usually involving @val) * @sleep_us: Maximum time to sleep between reads in us (0 * tight-loops). Should be less than ~20ms since usleep_range * is used (see Documentation/timers/timers-howto.rst). * @timeout_us: Timeout in us, 0 means never timeout * @sleep_before_read: if it is true, sleep @sleep_us before read. * Returns 0 on success and -ETIMEDOUT upon a timeout. In either * case, the last read value at @args is stored in @val. Must not * be called from atomic context if sleep_us or timeout_us are used. */ #define phy_read_mmd_poll_timeout(phydev, devaddr, regnum, val, cond, \ sleep_us, timeout_us, sleep_before_read) \ ({ \ int __ret = read_poll_timeout(phy_read_mmd, val, (cond) || val < 0, \ sleep_us, timeout_us, sleep_before_read, \ phydev, devaddr, regnum); \ if (val < 0) \ __ret = val; \ if (__ret) \ phydev_err(phydev, "%s failed: %d\n", __func__, __ret); \ __ret; \ }) /* * __phy_read_mmd - Convenience function for reading a register * from an MMD on a given PHY. */ int __phy_read_mmd(struct phy_device *phydev, int devad, u32 regnum); /* * phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); /* * __phy_write_mmd - Convenience function for writing a register * on an MMD on a given PHY. */ int __phy_write_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val); int __phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify_changed(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int phy_modify(struct phy_device *phydev, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd_changed(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int __phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); int phy_modify_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 mask, u16 set); /** * __phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, 0, val); } /** * __phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return __phy_modify(phydev, regnum, val, 0); } /** * phy_set_bits - Convenience function for setting bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to set */ static inline int phy_set_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, 0, val); } /** * phy_clear_bits - Convenience function for clearing bits in a PHY register * @phydev: the phy_device struct * @regnum: register number to write * @val: bits to clear */ static inline int phy_clear_bits(struct phy_device *phydev, u32 regnum, u16 val) { return phy_modify(phydev, regnum, val, 0); } /** * __phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set * * The caller must have taken the MDIO bus lock. */ static inline int __phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * __phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear * * The caller must have taken the MDIO bus lock. */ static inline int __phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return __phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_set_bits_mmd - Convenience function for setting bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to set */ static inline int phy_set_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, 0, val); } /** * phy_clear_bits_mmd - Convenience function for clearing bits in a register * on MMD * @phydev: the phy_device struct * @devad: the MMD containing register to modify * @regnum: register number to modify * @val: bits to clear */ static inline int phy_clear_bits_mmd(struct phy_device *phydev, int devad, u32 regnum, u16 val) { return phy_modify_mmd(phydev, devad, regnum, val, 0); } /** * phy_interrupt_is_valid - Convenience function for testing a given PHY irq * @phydev: the phy_device struct * * NOTE: must be kept in sync with addition/removal of PHY_POLL and * PHY_IGNORE_INTERRUPT */ static inline bool phy_interrupt_is_valid(struct phy_device *phydev) { return phydev->irq != PHY_POLL && phydev->irq != PHY_IGNORE_INTERRUPT; } /** * phy_polling_mode - Convenience function for testing whether polling is * used to detect PHY status changes * @phydev: the phy_device struct */ static inline bool phy_polling_mode(struct phy_device *phydev) { if (phydev->state == PHY_CABLETEST) if (phydev->drv->flags & PHY_POLL_CABLE_TEST) return true; return phydev->irq == PHY_POLL; } /** * phy_has_hwtstamp - Tests whether a PHY time stamp configuration. * @phydev: the phy_device struct */ static inline bool phy_has_hwtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->hwtstamp; } /** * phy_has_rxtstamp - Tests whether a PHY supports receive time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_rxtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->rxtstamp; } /** * phy_has_tsinfo - Tests whether a PHY reports time stamping and/or * PTP hardware clock capabilities. * @phydev: the phy_device struct */ static inline bool phy_has_tsinfo(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->ts_info; } /** * phy_has_txtstamp - Tests whether a PHY supports transmit time stamping. * @phydev: the phy_device struct */ static inline bool phy_has_txtstamp(struct phy_device *phydev) { return phydev && phydev->mii_ts && phydev->mii_ts->txtstamp; } static inline int phy_hwtstamp(struct phy_device *phydev, struct ifreq *ifr) { return phydev->mii_ts->hwtstamp(phydev->mii_ts, ifr); } static inline bool phy_rxtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { return phydev->mii_ts->rxtstamp(phydev->mii_ts, skb, type); } static inline int phy_ts_info(struct phy_device *phydev, struct ethtool_ts_info *tsinfo) { return phydev->mii_ts->ts_info(phydev->mii_ts, tsinfo); } static inline void phy_txtstamp(struct phy_device *phydev, struct sk_buff *skb, int type) { phydev->mii_ts->txtstamp(phydev->mii_ts, skb, type); } /** * phy_is_internal - Convenience function for testing if a PHY is internal * @phydev: the phy_device struct */ static inline bool phy_is_internal(struct phy_device *phydev) { return phydev->is_internal; } /** * phy_interface_mode_is_rgmii - Convenience function for testing if a * PHY interface mode is RGMII (all variants) * @mode: the &phy_interface_t enum */ static inline bool phy_interface_mode_is_rgmii(phy_interface_t mode) { return mode >= PHY_INTERFACE_MODE_RGMII && mode <= PHY_INTERFACE_MODE_RGMII_TXID; }; /** * phy_interface_mode_is_8023z() - does the PHY interface mode use 802.3z * negotiation * @mode: one of &enum phy_interface_t * * Returns true if the PHY interface mode uses the 16-bit negotiation * word as defined in 802.3z. (See 802.3-2015 37.2.1 Config_Reg encoding) */ static inline bool phy_interface_mode_is_8023z(phy_interface_t mode) { return mode == PHY_INTERFACE_MODE_1000BASEX || mode == PHY_INTERFACE_MODE_2500BASEX; } /** * phy_interface_is_rgmii - Convenience function for testing if a PHY interface * is RGMII (all variants) * @phydev: the phy_device struct */ static inline bool phy_interface_is_rgmii(struct phy_device *phydev) { return phy_interface_mode_is_rgmii(phydev->interface); }; /** * phy_is_pseudo_fixed_link - Convenience function for testing if this * PHY is the CPU port facing side of an Ethernet switch, or similar. * @phydev: the phy_device struct */ static inline bool phy_is_pseudo_fixed_link(struct phy_device *phydev) { return phydev->is_pseudo_fixed_link; } int phy_save_page(struct phy_device *phydev); int phy_select_page(struct phy_device *phydev, int page); int phy_restore_page(struct phy_device *phydev, int oldpage, int ret); int phy_read_paged(struct phy_device *phydev, int page, u32 regnum); int phy_write_paged(struct phy_device *phydev, int page, u32 regnum, u16 val); int phy_modify_paged_changed(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); int phy_modify_paged(struct phy_device *phydev, int page, u32 regnum, u16 mask, u16 set); struct phy_device *phy_device_create(struct mii_bus *bus, int addr, u32 phy_id, bool is_c45, struct phy_c45_device_ids *c45_ids); #if IS_ENABLED(CONFIG_PHYLIB) struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45); int phy_device_register(struct phy_device *phy); void phy_device_free(struct phy_device *phydev); #else static inline struct phy_device *get_phy_device(struct mii_bus *bus, int addr, bool is_c45) { return NULL; } static inline int phy_device_register(struct phy_device *phy) { return 0; } static inline void phy_device_free(struct phy_device *phydev) { } #endif /* CONFIG_PHYLIB */ void phy_device_remove(struct phy_device *phydev); int phy_init_hw(struct phy_device *phydev); int phy_suspend(struct phy_device *phydev); int phy_resume(struct phy_device *phydev); int __phy_resume(struct phy_device *phydev); int phy_loopback(struct phy_device *phydev, bool enable); void phy_sfp_attach(void *upstream, struct sfp_bus *bus); void phy_sfp_detach(void *upstream, struct sfp_bus *bus); int phy_sfp_probe(struct phy_device *phydev, const struct sfp_upstream_ops *ops); struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phy_interface_t interface); struct phy_device *phy_find_first(struct mii_bus *bus); int phy_attach_direct(struct net_device *dev, struct phy_device *phydev, u32 flags, phy_interface_t interface); int phy_connect_direct(struct net_device *dev, struct phy_device *phydev, void (*handler)(struct net_device *), phy_interface_t interface); struct phy_device *phy_connect(struct net_device *dev, const char *bus_id, void (*handler)(struct net_device *), phy_interface_t interface); void phy_disconnect(struct phy_device *phydev); void phy_detach(struct phy_device *phydev); void phy_start(struct phy_device *phydev); void phy_stop(struct phy_device *phydev); int phy_start_aneg(struct phy_device *phydev); int phy_aneg_done(struct phy_device *phydev); int phy_speed_down(struct phy_device *phydev, bool sync); int phy_speed_up(struct phy_device *phydev); int phy_restart_aneg(struct phy_device *phydev); int phy_reset_after_clk_enable(struct phy_device *phydev); #if IS_ENABLED(CONFIG_PHYLIB) int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack); int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config); #else static inline int phy_start_cable_test(struct phy_device *phydev, struct netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } static inline int phy_start_cable_test_tdr(struct phy_device *phydev, struct netlink_ext_ack *extack, const struct phy_tdr_config *config) { NL_SET_ERR_MSG(extack, "Kernel not compiled with PHYLIB support"); return -EOPNOTSUPP; } #endif int phy_cable_test_result(struct phy_device *phydev, u8 pair, u16 result); int phy_cable_test_fault_length(struct phy_device *phydev, u8 pair, u16 cm); static inline void phy_device_reset(struct phy_device *phydev, int value) { mdio_device_reset(&phydev->mdio, value); } #define phydev_err(_phydev, format, args...) \ dev_err(&_phydev->mdio.dev, format, ##args) #define phydev_info(_phydev, format, args...) \ dev_info(&_phydev->mdio.dev, format, ##args) #define phydev_warn(_phydev, format, args...) \ dev_warn(&_phydev->mdio.dev, format, ##args) #define phydev_dbg(_phydev, format, args...) \ dev_dbg(&_phydev->mdio.dev, format, ##args) static inline const char *phydev_name(const struct phy_device *phydev) { return dev_name(&phydev->mdio.dev); } static inline void phy_lock_mdio_bus(struct phy_device *phydev) { mutex_lock(&phydev->mdio.bus->mdio_lock); } static inline void phy_unlock_mdio_bus(struct phy_device *phydev) { mutex_unlock(&phydev->mdio.bus->mdio_lock); } void phy_attached_print(struct phy_device *phydev, const char *fmt, ...) __printf(2, 3); char *phy_attached_info_irq(struct phy_device *phydev) __malloc; void phy_attached_info(struct phy_device *phydev); /* Clause 22 PHY */ int genphy_read_abilities(struct phy_device *phydev); int genphy_setup_forced(struct phy_device *phydev); int genphy_restart_aneg(struct phy_device *phydev); int genphy_check_and_restart_aneg(struct phy_device *phydev, bool restart); int genphy_config_eee_advert(struct phy_device *phydev); int __genphy_config_aneg(struct phy_device *phydev, bool changed); int genphy_aneg_done(struct phy_device *phydev); int genphy_update_link(struct phy_device *phydev); int genphy_read_lpa(struct phy_device *phydev); int genphy_read_status_fixed(struct phy_device *phydev); int genphy_read_status(struct phy_device *phydev); int genphy_suspend(struct phy_device *phydev); int genphy_resume(struct phy_device *phydev); int genphy_loopback(struct phy_device *phydev, bool enable); int genphy_soft_reset(struct phy_device *phydev); static inline int genphy_config_aneg(struct phy_device *phydev) { return __genphy_config_aneg(phydev, false); } static inline int genphy_no_ack_interrupt(struct phy_device *phydev) { return 0; } static inline int genphy_no_config_intr(struct phy_device *phydev) { return 0; } int genphy_read_mmd_unsupported(struct phy_device *phdev, int devad, u16 regnum); int genphy_write_mmd_unsupported(struct phy_device *phdev, int devnum, u16 regnum, u16 val); /* Clause 37 */ int genphy_c37_config_aneg(struct phy_device *phydev); int genphy_c37_read_status(struct phy_device *phydev); /* Clause 45 PHY */ int genphy_c45_restart_aneg(struct phy_device *phydev); int genphy_c45_check_and_restart_aneg(struct phy_device *phydev, bool restart); int genphy_c45_aneg_done(struct phy_device *phydev); int genphy_c45_read_link(struct phy_device *phydev); int genphy_c45_read_lpa(struct phy_device *phydev); int genphy_c45_read_pma(struct phy_device *phydev); int genphy_c45_pma_setup_forced(struct phy_device *phydev); int genphy_c45_an_config_aneg(struct phy_device *phydev); int genphy_c45_an_disable_aneg(struct phy_device *phydev); int genphy_c45_read_mdix(struct phy_device *phydev); int genphy_c45_pma_read_abilities(struct phy_device *phydev); int genphy_c45_read_status(struct phy_device *phydev); int genphy_c45_config_aneg(struct phy_device *phydev); /* Generic C45 PHY driver */ extern struct phy_driver genphy_c45_driver; /* The gen10g_* functions are the old Clause 45 stub */ int gen10g_config_aneg(struct phy_device *phydev); static inline int phy_read_status(struct phy_device *phydev) { if (!phydev->drv) return -EIO; if (phydev->drv->read_status) return phydev->drv->read_status(phydev); else return genphy_read_status(phydev); } void phy_driver_unregister(struct phy_driver *drv); void phy_drivers_unregister(struct phy_driver *drv, int n); int phy_driver_register(struct phy_driver *new_driver, struct module *owner); int phy_drivers_register(struct phy_driver *new_driver, int n, struct module *owner); void phy_state_machine(struct work_struct *work); void phy_queue_state_machine(struct phy_device *phydev, unsigned long jiffies); void phy_mac_interrupt(struct phy_device *phydev); void phy_start_machine(struct phy_device *phydev); void phy_stop_machine(struct phy_device *phydev); void phy_ethtool_ksettings_get(struct phy_device *phydev, struct ethtool_link_ksettings *cmd); int phy_ethtool_ksettings_set(struct phy_device *phydev, const struct ethtool_link_ksettings *cmd); int phy_mii_ioctl(struct phy_device *phydev, struct ifreq *ifr, int cmd); int phy_do_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_do_ioctl_running(struct net_device *dev, struct ifreq *ifr, int cmd); int phy_disable_interrupts(struct phy_device *phydev); void phy_request_interrupt(struct phy_device *phydev); void phy_free_interrupt(struct phy_device *phydev); void phy_print_status(struct phy_device *phydev); int phy_set_max_speed(struct phy_device *phydev, u32 max_speed); void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode); void phy_advertise_supported(struct phy_device *phydev); void phy_support_sym_pause(struct phy_device *phydev); void phy_support_asym_pause(struct phy_device *phydev); void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx, bool autoneg); void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx); bool phy_validate_pause(struct phy_device *phydev, struct ethtool_pauseparam *pp); void phy_get_pause(struct phy_device *phydev, bool *tx_pause, bool *rx_pause); s32 phy_get_internal_delay(struct phy_device *phydev, struct device *dev, const int *delay_values, int size, bool is_rx); void phy_resolve_pause(unsigned long *local_adv, unsigned long *partner_adv, bool *tx_pause, bool *rx_pause); int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_register_fixup_for_id(const char *bus_id, int (*run)(struct phy_device *)); int phy_register_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask, int (*run)(struct phy_device *)); int phy_unregister_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask); int phy_unregister_fixup_for_id(const char *bus_id); int phy_unregister_fixup_for_uid(u32 phy_uid, u32 phy_uid_mask); int phy_init_eee(struct phy_device *phydev, bool clk_stop_enable); int phy_get_eee_err(struct phy_device *phydev); int phy_ethtool_set_eee(struct phy_device *phydev, struct ethtool_eee *data); int phy_ethtool_get_eee(struct phy_device *phydev, struct ethtool_eee *data); int phy_ethtool_set_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); void phy_ethtool_get_wol(struct phy_device *phydev, struct ethtool_wolinfo *wol); int phy_ethtool_get_link_ksettings(struct net_device *ndev, struct ethtool_link_ksettings *cmd); int phy_ethtool_set_link_ksettings(struct net_device *ndev, const struct ethtool_link_ksettings *cmd); int phy_ethtool_nway_reset(struct net_device *ndev); int phy_package_join(struct phy_device *phydev, int addr, size_t priv_size); void phy_package_leave(struct phy_device *phydev); int devm_phy_package_join(struct device *dev, struct phy_device *phydev, int addr, size_t priv_size); #if IS_ENABLED(CONFIG_PHYLIB) int __init mdio_bus_init(void); void mdio_bus_exit(void); #endif int phy_ethtool_get_strings(struct phy_device *phydev, u8 *data); int phy_ethtool_get_sset_count(struct phy_device *phydev); int phy_ethtool_get_stats(struct phy_device *phydev, struct ethtool_stats *stats, u64 *data); static inline int phy_package_read(struct phy_device *phydev, u32 regnum) { struct phy_package_shared *shared = phydev->shared; if (!shared) return -EIO; return mdiobus_read(phydev->mdio.bus, shared->addr, regnum); } static inline int __phy_package_read(struct phy_device *phydev, u32 regnum) { struct phy_package_shared *shared = phydev->shared; if (!shared) return -EIO; return __mdiobus_read(phydev->mdio.bus, shared->addr, regnum); } static inline int phy_package_write(struct phy_device *phydev, u32 regnum, u16 val) { struct phy_package_shared *shared = phydev->shared; if (!shared) return -EIO; return mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); } static inline int __phy_package_write(struct phy_device *phydev, u32 regnum, u16 val) { struct phy_package_shared *shared = phydev->shared; if (!shared) return -EIO; return __mdiobus_write(phydev->mdio.bus, shared->addr, regnum, val); } static inline bool __phy_package_set_once(struct phy_device *phydev, unsigned int b) { struct phy_package_shared *shared = phydev->shared; if (!shared) return false; return !test_and_set_bit(b, &shared->flags); } static inline bool phy_package_init_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_INIT_DONE); } static inline bool phy_package_probe_once(struct phy_device *phydev) { return __phy_package_set_once(phydev, PHY_SHARED_F_PROBE_DONE); } extern struct bus_type mdio_bus_type; struct mdio_board_info { const char *bus_id; char modalias[MDIO_NAME_SIZE]; int mdio_addr; const void *platform_data; }; #if IS_ENABLED(CONFIG_MDIO_DEVICE) int mdiobus_register_board_info(const struct mdio_board_info *info, unsigned int n); #else static inline int mdiobus_register_board_info(const struct mdio_board_info *i, unsigned int n) { return 0; } #endif /** * phy_module_driver() - Helper macro for registering PHY drivers * @__phy_drivers: array of PHY drivers to register * @__count: Numbers of members in array * * Helper macro for PHY drivers which do not do anything special in module * init/exit. Each module may only use this macro once, and calling it * replaces module_init() and module_exit(). */ #define phy_module_driver(__phy_drivers, __count) \ static int __init phy_module_init(void) \ { \ return phy_drivers_register(__phy_drivers, __count, THIS_MODULE); \ } \ module_init(phy_module_init); \ static void __exit phy_module_exit(void) \ { \ phy_drivers_unregister(__phy_drivers, __count); \ } \ module_exit(phy_module_exit) #define module_phy_driver(__phy_drivers) \ phy_module_driver(__phy_drivers, ARRAY_SIZE(__phy_drivers)) bool phy_driver_is_genphy(struct phy_device *phydev); bool phy_driver_is_genphy_10g(struct phy_device *phydev); #endif /* __PHY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_DMA_MAPPING_H #define _ASM_X86_DMA_MAPPING_H /* * IOMMU interface. See Documentation/core-api/dma-api-howto.rst and * Documentation/core-api/dma-api.rst for documentation. */ #include <linux/scatterlist.h> #include <asm/io.h> #include <asm/swiotlb.h> extern int iommu_merge; extern int panic_on_overflow; extern const struct dma_map_ops *dma_ops; static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { return dma_ops; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BSEARCH_H #define _LINUX_BSEARCH_H #include <linux/types.h> static __always_inline void *__inline_bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { const char *pivot; int result; while (num > 0) { pivot = base + (num >> 1) * size; result = cmp(key, pivot); if (result == 0) return (void *)pivot; if (result > 0) { base = pivot + size; num--; } num >>= 1; } return NULL; } extern void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp); #endif /* _LINUX_BSEARCH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 /* * Performance events x86 architecture header * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2009 Jaswinder Singh Rajput * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> * Copyright (C) 2009 Google, Inc., Stephane Eranian * * For licencing details see kernel-base/COPYING */ #include <linux/perf_event.h> #include <asm/intel_ds.h> /* To enable MSR tracing please use the generic trace points. */ /* * | NHM/WSM | SNB | * register ------------------------------- * | HT | no HT | HT | no HT | *----------------------------------------- * offcore | core | core | cpu | core | * lbr_sel | core | core | cpu | core | * ld_lat | cpu | core | cpu | core | *----------------------------------------- * * Given that there is a small number of shared regs, * we can pre-allocate their slot in the per-cpu * per-core reg tables. */ enum extra_reg_type { EXTRA_REG_NONE = -1, /* not used */ EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ EXTRA_REG_LBR = 2, /* lbr_select */ EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ EXTRA_REG_FE = 4, /* fe_* */ EXTRA_REG_MAX /* number of entries needed */ }; struct event_constraint { union { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; u64 idxmsk64; }; u64 code; u64 cmask; int weight; int overlap; int flags; unsigned int size; }; static inline bool constraint_match(struct event_constraint *c, u64 ecode) { return ((ecode & c->cmask) - c->code) <= (u64)c->size; } /* * struct hw_perf_event.flags flags */ #define PERF_X86_EVENT_PEBS_LDLAT 0x0001 /* ld+ldlat data address sampling */ #define PERF_X86_EVENT_PEBS_ST 0x0002 /* st data address sampling */ #define PERF_X86_EVENT_PEBS_ST_HSW 0x0004 /* haswell style datala, store */ #define PERF_X86_EVENT_PEBS_LD_HSW 0x0008 /* haswell style datala, load */ #define PERF_X86_EVENT_PEBS_NA_HSW 0x0010 /* haswell style datala, unknown */ #define PERF_X86_EVENT_EXCL 0x0020 /* HT exclusivity on counter */ #define PERF_X86_EVENT_DYNAMIC 0x0040 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0080 /* grant rdpmc permission */ #define PERF_X86_EVENT_EXCL_ACCT 0x0100 /* accounted EXCL event */ #define PERF_X86_EVENT_AUTO_RELOAD 0x0200 /* use PEBS auto-reload */ #define PERF_X86_EVENT_LARGE_PEBS 0x0400 /* use large PEBS */ #define PERF_X86_EVENT_PEBS_VIA_PT 0x0800 /* use PT buffer for PEBS */ #define PERF_X86_EVENT_PAIR 0x1000 /* Large Increment per Cycle */ #define PERF_X86_EVENT_LBR_SELECT 0x2000 /* Save/Restore MSR_LBR_SELECT */ #define PERF_X86_EVENT_TOPDOWN 0x4000 /* Count Topdown slots/metrics events */ static inline bool is_topdown_count(struct perf_event *event) { return event->hw.flags & PERF_X86_EVENT_TOPDOWN; } static inline bool is_metric_event(struct perf_event *event) { u64 config = event->attr.config; return ((config & ARCH_PERFMON_EVENTSEL_EVENT) == 0) && ((config & INTEL_ARCH_EVENT_MASK) >= INTEL_TD_METRIC_RETIRING) && ((config & INTEL_ARCH_EVENT_MASK) <= INTEL_TD_METRIC_MAX); } static inline bool is_slots_event(struct perf_event *event) { return (event->attr.config & INTEL_ARCH_EVENT_MASK) == INTEL_TD_SLOTS; } static inline bool is_topdown_event(struct perf_event *event) { return is_metric_event(event) || is_slots_event(event); } struct amd_nb { int nb_id; /* NorthBridge id */ int refcnt; /* reference count */ struct perf_event *owners[X86_PMC_IDX_MAX]; struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) #define PEBS_PMI_AFTER_EACH_RECORD BIT_ULL(60) #define PEBS_OUTPUT_OFFSET 61 #define PEBS_OUTPUT_MASK (3ull << PEBS_OUTPUT_OFFSET) #define PEBS_OUTPUT_PT (1ull << PEBS_OUTPUT_OFFSET) #define PEBS_VIA_PT_MASK (PEBS_OUTPUT_PT | PEBS_PMI_AFTER_EACH_RECORD) /* * Flags PEBS can handle without an PMI. * * TID can only be handled by flushing at context switch. * REGS_USER can be handled for events limited to ring 3. * */ #define LARGE_PEBS_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER | \ PERF_SAMPLE_PERIOD) #define PEBS_GP_REGS \ ((1ULL << PERF_REG_X86_AX) | \ (1ULL << PERF_REG_X86_BX) | \ (1ULL << PERF_REG_X86_CX) | \ (1ULL << PERF_REG_X86_DX) | \ (1ULL << PERF_REG_X86_DI) | \ (1ULL << PERF_REG_X86_SI) | \ (1ULL << PERF_REG_X86_SP) | \ (1ULL << PERF_REG_X86_BP) | \ (1ULL << PERF_REG_X86_IP) | \ (1ULL << PERF_REG_X86_FLAGS) | \ (1ULL << PERF_REG_X86_R8) | \ (1ULL << PERF_REG_X86_R9) | \ (1ULL << PERF_REG_X86_R10) | \ (1ULL << PERF_REG_X86_R11) | \ (1ULL << PERF_REG_X86_R12) | \ (1ULL << PERF_REG_X86_R13) | \ (1ULL << PERF_REG_X86_R14) | \ (1ULL << PERF_REG_X86_R15)) /* * Per register state. */ struct er_account { raw_spinlock_t lock; /* per-core: protect structure */ u64 config; /* extra MSR config */ u64 reg; /* extra MSR number */ atomic_t ref; /* reference count */ }; /* * Per core/cpu state * * Used to coordinate shared registers between HT threads or * among events on a single PMU. */ struct intel_shared_regs { struct er_account regs[EXTRA_REG_MAX]; int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ }; enum intel_excl_state_type { INTEL_EXCL_UNUSED = 0, /* counter is unused */ INTEL_EXCL_SHARED = 1, /* counter can be used by both threads */ INTEL_EXCL_EXCLUSIVE = 2, /* counter can be used by one thread only */ }; struct intel_excl_states { enum intel_excl_state_type state[X86_PMC_IDX_MAX]; bool sched_started; /* true if scheduling has started */ }; struct intel_excl_cntrs { raw_spinlock_t lock; struct intel_excl_states states[2]; union { u16 has_exclusive[2]; u32 exclusive_present; }; int refcnt; /* per-core: #HT threads */ unsigned core_id; /* per-core: core id */ }; struct x86_perf_task_context; #define MAX_LBR_ENTRIES 32 enum { LBR_FORMAT_32 = 0x00, LBR_FORMAT_LIP = 0x01, LBR_FORMAT_EIP = 0x02, LBR_FORMAT_EIP_FLAGS = 0x03, LBR_FORMAT_EIP_FLAGS2 = 0x04, LBR_FORMAT_INFO = 0x05, LBR_FORMAT_TIME = 0x06, LBR_FORMAT_MAX_KNOWN = LBR_FORMAT_TIME, }; enum { X86_PERF_KFREE_SHARED = 0, X86_PERF_KFREE_EXCL = 1, X86_PERF_KFREE_MAX }; struct cpu_hw_events { /* * Generic x86 PMC bits */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long running[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; int n_events; /* the # of events in the below arrays */ int n_added; /* the # last events in the below arrays; they've never been enabled yet */ int n_txn; /* the # last events in the below arrays; added in the current transaction */ int n_txn_pair; int n_txn_metric; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ struct event_constraint *event_constraint[X86_PMC_IDX_MAX]; int n_excl; /* the number of exclusive events */ unsigned int txn_flags; int is_fake; /* * Intel DebugStore bits */ struct debug_store *ds; void *ds_pebs_vaddr; void *ds_bts_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; int n_pebs_via_pt; int pebs_output; /* Current super set of events hardware configuration */ u64 pebs_data_cfg; u64 active_pebs_data_cfg; int pebs_record_size; /* * Intel LBR bits */ int lbr_users; int lbr_pebs_users; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; union { struct er_account *lbr_sel; struct er_account *lbr_ctl; }; u64 br_sel; void *last_task_ctx; int last_log_id; int lbr_select; void *lbr_xsave; /* * Intel host/guest exclude bits */ u64 intel_ctrl_guest_mask; u64 intel_ctrl_host_mask; struct perf_guest_switch_msr guest_switch_msrs[X86_PMC_IDX_MAX]; /* * Intel checkpoint mask */ u64 intel_cp_status; /* * manage shared (per-core, per-cpu) registers * used on Intel NHM/WSM/SNB */ struct intel_shared_regs *shared_regs; /* * manage exclusive counter access between hyperthread */ struct event_constraint *constraint_list; /* in enable order */ struct intel_excl_cntrs *excl_cntrs; int excl_thread_id; /* 0 or 1 */ /* * SKL TSX_FORCE_ABORT shadow */ u64 tfa_shadow; /* * Perf Metrics */ /* number of accepted metrics events */ int n_metric; /* * AMD specific bits */ struct amd_nb *amd_nb; /* Inverted mask of bits to clear in the perf_ctr ctrl registers */ u64 perf_ctr_virt_mask; int n_pair; /* Large increment events */ void *kfree_on_online[X86_PERF_KFREE_MAX]; struct pmu *pmu; }; #define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) { \ { .idxmsk64 = (n) }, \ .code = (c), \ .size = (e) - (c), \ .cmask = (m), \ .weight = (w), \ .overlap = (o), \ .flags = f, \ } #define __EVENT_CONSTRAINT(c, n, m, w, o, f) \ __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f) #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) /* * The constraint_match() function only works for 'simple' event codes * and not for extended (AMD64_EVENTSEL_EVENT) events codes. */ #define EVENT_CONSTRAINT_RANGE(c, e, n, m) \ __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0) #define INTEL_EXCLEVT_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ 0, PERF_X86_EVENT_EXCL) /* * The overlap flag marks event constraints with overlapping counter * masks. This is the case if the counter mask of such an event is not * a subset of any other counter mask of a constraint with an equal or * higher weight, e.g.: * * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); * * The event scheduler may not select the correct counter in the first * cycle because it needs to know which subsequent events will be * scheduled. It may fail to schedule the events then. So we set the * overlap flag for such constraints to give the scheduler a hint which * events to select for counter rescheduling. * * Care must be taken as the rescheduling algorithm is O(n!) which * will increase scheduling cycles for an over-committed system * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros * and its counter masks must be kept at a minimum. */ #define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1, 0) /* * Constraint on the Event code. */ #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) /* * Constraint on a range of Event codes */ #define INTEL_EVENT_CONSTRAINT_RANGE(c, e, n) \ EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT) /* * Constraint on the Event code + UMask + fixed-mask * * filter mask to validate fixed counter events. * the following filters disqualify for fixed counters: * - inv * - edge * - cnt-mask * - in_tx * - in_tx_checkpointed * The other filters are supported by fixed counters. * The any-thread option is supported starting with v3. */ #define FIXED_EVENT_FLAGS (X86_RAW_EVENT_MASK|HSW_IN_TX|HSW_IN_TX_CHECKPOINTED) #define FIXED_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, (1ULL << (32+n)), FIXED_EVENT_FLAGS) /* * The special metric counters do not actually exist. They are calculated from * the combination of the FxCtr3 + MSR_PERF_METRICS. * * The special metric counters are mapped to a dummy offset for the scheduler. * The sharing between multiple users of the same metric without multiplexing * is not allowed, even though the hardware supports that in principle. */ #define METRIC_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, (1ULL << (INTEL_PMC_IDX_METRIC_BASE + n)), \ INTEL_ARCH_EVENT_MASK) /* * Constraint on the Event code + UMask */ #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) /* Constraint on specific umask bit only + event */ #define INTEL_UBIT_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|(c)) /* Like UEVENT_CONSTRAINT, but match flags too */ #define INTEL_FLAGS_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS) #define INTEL_EXCLUEVT_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ HWEIGHT(n), 0, PERF_X86_EVENT_EXCL) #define INTEL_PLD_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) #define INTEL_PST_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) /* Event constraint, but match on all event flags too. */ #define INTEL_FLAGS_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) #define INTEL_FLAGS_EVENT_CONSTRAINT_RANGE(c, e, n) \ EVENT_CONSTRAINT_RANGE(c, e, n, ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS) /* Check only flags, but allow all event/umask */ #define INTEL_ALL_EVENT_CONSTRAINT(code, n) \ EVENT_CONSTRAINT(code, n, X86_ALL_EVENT_FLAGS) /* Check flags and event code, and set the HSW store flag */ #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_ST(code, n) \ __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) /* Check flags and event code, and set the HSW load flag */ #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD(code, n) \ __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_LD_RANGE(code, end, n) \ __EVENT_CONSTRAINT_RANGE(code, end, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) #define INTEL_FLAGS_EVENT_CONSTRAINT_DATALA_XLD(code, n) \ __EVENT_CONSTRAINT(code, n, \ ARCH_PERFMON_EVENTSEL_EVENT|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, \ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) /* Check flags and event code/umask, and set the HSW store flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_ST(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST_HSW) #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XST(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, \ PERF_X86_EVENT_PEBS_ST_HSW|PERF_X86_EVENT_EXCL) /* Check flags and event code/umask, and set the HSW load flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_LD(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LD_HSW) #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_XLD(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, \ PERF_X86_EVENT_PEBS_LD_HSW|PERF_X86_EVENT_EXCL) /* Check flags and event code/umask, and set the HSW N/A flag */ #define INTEL_FLAGS_UEVENT_CONSTRAINT_DATALA_NA(code, n) \ __EVENT_CONSTRAINT(code, n, \ INTEL_ARCH_EVENT_MASK|X86_ALL_EVENT_FLAGS, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_NA_HSW) /* * We define the end marker as having a weight of -1 * to enable blacklisting of events using a counter bitmask * of zero and thus a weight of zero. * The end marker has a weight that cannot possibly be * obtained from counting the bits in the bitmask. */ #define EVENT_CONSTRAINT_END { .weight = -1 } /* * Check for end marker with weight == -1 */ #define for_each_event_constraint(e, c) \ for ((e) = (c); (e)->weight != -1; (e)++) /* * Extra registers for specific events. * * Some events need large masks and require external MSRs. * Those extra MSRs end up being shared for all events on * a PMU and sometimes between PMU of sibling HT threads. * In either case, the kernel needs to handle conflicting * accesses to those extra, shared, regs. The data structure * to manage those registers is stored in cpu_hw_event. */ struct extra_reg { unsigned int event; unsigned int msr; u64 config_mask; u64 valid_mask; int idx; /* per_xxx->regs[] reg index */ bool extra_msr_access; }; #define EVENT_EXTRA_REG(e, ms, m, vm, i) { \ .event = (e), \ .msr = (ms), \ .config_mask = (m), \ .valid_mask = (vm), \ .idx = EXTRA_REG_##i, \ .extra_msr_access = true, \ } #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) #define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) #define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ INTEL_UEVENT_EXTRA_REG(c, \ MSR_PEBS_LD_LAT_THRESHOLD, \ 0xffff, \ LDLAT) #define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) union perf_capabilities { struct { u64 lbr_format:6; u64 pebs_trap:1; u64 pebs_arch_reg:1; u64 pebs_format:4; u64 smm_freeze:1; /* * PMU supports separate counter range for writing * values > 32bit. */ u64 full_width_write:1; u64 pebs_baseline:1; u64 perf_metrics:1; u64 pebs_output_pt_available:1; u64 anythread_deprecated:1; }; u64 capabilities; }; struct x86_pmu_quirk { struct x86_pmu_quirk *next; void (*func)(void); }; union x86_pmu_config { struct { u64 event:8, umask:8, usr:1, os:1, edge:1, pc:1, interrupt:1, __reserved1:1, en:1, inv:1, cmask:8, event2:4, __reserved2:4, go:1, ho:1; } bits; u64 value; }; #define X86_CONFIG(args...) ((union x86_pmu_config){.bits = {args}}).value enum { x86_lbr_exclusive_lbr, x86_lbr_exclusive_bts, x86_lbr_exclusive_pt, x86_lbr_exclusive_max, }; /* * struct x86_pmu - generic x86 pmu */ struct x86_pmu { /* * Generic x86 PMC bits */ const char *name; int version; int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); void (*add)(struct perf_event *); void (*del)(struct perf_event *); void (*read)(struct perf_event *event); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; unsigned perfctr; int (*addr_offset)(int index, bool eventsel); int (*rdpmc_index)(int index); u64 (*event_map)(int); int max_events; int num_counters; int num_counters_fixed; int cntval_bits; u64 cntval_mask; union { unsigned long events_maskl; unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; }; int events_mask_len; int apic; u64 max_period; struct event_constraint * (*get_event_constraints)(struct cpu_hw_events *cpuc, int idx, struct perf_event *event); void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); void (*start_scheduling)(struct cpu_hw_events *cpuc); void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); void (*stop_scheduling)(struct cpu_hw_events *cpuc); struct event_constraint *event_constraints; struct x86_pmu_quirk *quirks; int perfctr_second_write; u64 (*limit_period)(struct perf_event *event, u64 l); /* PMI handler bits */ unsigned int late_ack :1, enabled_ack :1, counter_freezing :1; /* * sysfs attrs */ int attr_rdpmc_broken; int attr_rdpmc; struct attribute **format_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); const struct attribute_group **attr_update; unsigned long attr_freeze_on_smi; /* * CPU Hotplug hooks */ int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); void (*check_microcode)(void); void (*sched_task)(struct perf_event_context *ctx, bool sched_in); /* * Intel Arch Perfmon v2+ */ u64 intel_ctrl; union perf_capabilities intel_cap; /* * Intel DebugStore bits */ unsigned int bts :1, bts_active :1, pebs :1, pebs_active :1, pebs_broken :1, pebs_prec_dist :1, pebs_no_tlb :1, pebs_no_isolation :1; int pebs_record_size; int pebs_buffer_size; int max_pebs_events; void (*drain_pebs)(struct pt_regs *regs, struct perf_sample_data *data); struct event_constraint *pebs_constraints; void (*pebs_aliases)(struct perf_event *event); unsigned long large_pebs_flags; u64 rtm_abort_event; /* * Intel LBR */ unsigned int lbr_tos, lbr_from, lbr_to, lbr_info, lbr_nr; /* LBR base regs and size */ union { u64 lbr_sel_mask; /* LBR_SELECT valid bits */ u64 lbr_ctl_mask; /* LBR_CTL valid bits */ }; union { const int *lbr_sel_map; /* lbr_select mappings */ int *lbr_ctl_map; /* LBR_CTL mappings */ }; bool lbr_double_abort; /* duplicated lbr aborts */ bool lbr_pt_coexist; /* (LBR|BTS) may coexist with PT */ /* * Intel Architectural LBR CPUID Enumeration */ unsigned int lbr_depth_mask:8; unsigned int lbr_deep_c_reset:1; unsigned int lbr_lip:1; unsigned int lbr_cpl:1; unsigned int lbr_filter:1; unsigned int lbr_call_stack:1; unsigned int lbr_mispred:1; unsigned int lbr_timed_lbr:1; unsigned int lbr_br_type:1; void (*lbr_reset)(void); void (*lbr_read)(struct cpu_hw_events *cpuc); void (*lbr_save)(void *ctx); void (*lbr_restore)(void *ctx); /* * Intel PT/LBR/BTS are exclusive */ atomic_t lbr_exclusive[x86_lbr_exclusive_max]; /* * Intel perf metrics */ u64 (*update_topdown_event)(struct perf_event *event); int (*set_topdown_event_period)(struct perf_event *event); /* * perf task context (i.e. struct perf_event_context::task_ctx_data) * switch helper to bridge calls from perf/core to perf/x86. * See struct pmu::swap_task_ctx() usage for examples; */ void (*swap_task_ctx)(struct perf_event_context *prev, struct perf_event_context *next); /* * AMD bits */ unsigned int amd_nb_constraints : 1; u64 perf_ctr_pair_en; /* * Extra registers for events */ struct extra_reg *extra_regs; unsigned int flags; /* * Intel host/guest support (KVM) */ struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); /* * Check period value for PERF_EVENT_IOC_PERIOD ioctl. */ int (*check_period) (struct perf_event *event, u64 period); int (*aux_output_match) (struct perf_event *event); }; struct x86_perf_task_context_opt { int lbr_callstack_users; int lbr_stack_state; int log_id; }; struct x86_perf_task_context { u64 lbr_sel; int tos; int valid_lbrs; struct x86_perf_task_context_opt opt; struct lbr_entry lbr[MAX_LBR_ENTRIES]; }; struct x86_perf_task_context_arch_lbr { struct x86_perf_task_context_opt opt; struct lbr_entry entries[]; }; /* * Add padding to guarantee the 64-byte alignment of the state buffer. * * The structure is dynamically allocated. The size of the LBR state may vary * based on the number of LBR registers. * * Do not put anything after the LBR state. */ struct x86_perf_task_context_arch_lbr_xsave { struct x86_perf_task_context_opt opt; union { struct xregs_state xsave; struct { struct fxregs_state i387; struct xstate_header header; struct arch_lbr_state lbr; } __attribute__ ((packed, aligned (XSAVE_ALIGNMENT))); }; }; #define x86_add_quirk(func_) \ do { \ static struct x86_pmu_quirk __quirk __initdata = { \ .func = func_, \ }; \ __quirk.next = x86_pmu.quirks; \ x86_pmu.quirks = &__quirk; \ } while (0) /* * x86_pmu flags */ #define PMU_FL_NO_HT_SHARING 0x1 /* no hyper-threading resource sharing */ #define PMU_FL_HAS_RSP_1 0x2 /* has 2 equivalent offcore_rsp regs */ #define PMU_FL_EXCL_CNTRS 0x4 /* has exclusive counter requirements */ #define PMU_FL_EXCL_ENABLED 0x8 /* exclusive counter active */ #define PMU_FL_PEBS_ALL 0x10 /* all events are valid PEBS events */ #define PMU_FL_TFA 0x20 /* deal with TSX force abort */ #define PMU_FL_PAIR 0x40 /* merge counters for large incr. events */ #define EVENT_VAR(_id) event_attr_##_id #define EVENT_PTR(_id) &event_attr_##_id.attr.attr #define EVENT_ATTR(_name, _id) \ static struct perf_pmu_events_attr EVENT_VAR(_id) = { \ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ .id = PERF_COUNT_HW_##_id, \ .event_str = NULL, \ }; #define EVENT_ATTR_STR(_name, v, str) \ static struct perf_pmu_events_attr event_attr_##v = { \ .attr = __ATTR(_name, 0444, events_sysfs_show, NULL), \ .id = 0, \ .event_str = str, \ }; #define EVENT_ATTR_STR_HT(_name, v, noht, ht) \ static struct perf_pmu_events_ht_attr event_attr_##v = { \ .attr = __ATTR(_name, 0444, events_ht_sysfs_show, NULL),\ .id = 0, \ .event_str_noht = noht, \ .event_str_ht = ht, \ } struct pmu *x86_get_pmu(unsigned int cpu); extern struct x86_pmu x86_pmu __read_mostly; static __always_inline struct x86_perf_task_context_opt *task_context_opt(void *ctx) { if (static_cpu_has(X86_FEATURE_ARCH_LBR)) return &((struct x86_perf_task_context_arch_lbr *)ctx)->opt; return &((struct x86_perf_task_context *)ctx)->opt; } static inline bool x86_pmu_has_lbr_callstack(void) { return x86_pmu.lbr_sel_map && x86_pmu.lbr_sel_map[PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] > 0; } DECLARE_PER_CPU(struct cpu_hw_events, cpu_hw_events); int x86_perf_event_set_period(struct perf_event *event); /* * Generalized hw caching related hw_event table, filled * in on a per model basis. A value of 0 means * 'not supported', -1 means 'hw_event makes no sense on * this CPU', any other value means the raw hw_event * ID. */ #define C(x) PERF_COUNT_HW_CACHE_##x extern u64 __read_mostly hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; extern u64 __read_mostly hw_cache_extra_regs [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX]; u64 x86_perf_event_update(struct perf_event *event); static inline unsigned int x86_pmu_config_addr(int index) { return x86_pmu.eventsel + (x86_pmu.addr_offset ? x86_pmu.addr_offset(index, true) : index); } static inline unsigned int x86_pmu_event_addr(int index) { return x86_pmu.perfctr + (x86_pmu.addr_offset ? x86_pmu.addr_offset(index, false) : index); } static inline int x86_pmu_rdpmc_index(int index) { return x86_pmu.rdpmc_index ? x86_pmu.rdpmc_index(index) : index; } int x86_add_exclusive(unsigned int what); void x86_del_exclusive(unsigned int what); int x86_reserve_hardware(void); void x86_release_hardware(void); int x86_pmu_max_precise(void); void hw_perf_lbr_event_destroy(struct perf_event *event); int x86_setup_perfctr(struct perf_event *event); int x86_pmu_hw_config(struct perf_event *event); void x86_pmu_disable_all(void); static inline bool is_counter_pair(struct hw_perf_event *hwc) { return hwc->flags & PERF_X86_EVENT_PAIR; } static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, u64 enable_mask) { u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); if (hwc->extra_reg.reg) wrmsrl(hwc->extra_reg.reg, hwc->extra_reg.config); /* * Add enabled Merge event on next counter * if large increment event being enabled on this counter */ if (is_counter_pair(hwc)) wrmsrl(x86_pmu_config_addr(hwc->idx + 1), x86_pmu.perf_ctr_pair_en); wrmsrl(hwc->config_base, (hwc->config | enable_mask) & ~disable_mask); } void x86_pmu_enable_all(int added); int perf_assign_events(struct event_constraint **constraints, int n, int wmin, int wmax, int gpmax, int *assign); int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign); void x86_pmu_stop(struct perf_event *event, int flags); static inline void x86_pmu_disable_event(struct perf_event *event) { u64 disable_mask = __this_cpu_read(cpu_hw_events.perf_ctr_virt_mask); struct hw_perf_event *hwc = &event->hw; wrmsrl(hwc->config_base, hwc->config & ~disable_mask); if (is_counter_pair(hwc)) wrmsrl(x86_pmu_config_addr(hwc->idx + 1), 0); } void x86_pmu_enable_event(struct perf_event *event); int x86_pmu_handle_irq(struct pt_regs *regs); extern struct event_constraint emptyconstraint; extern struct event_constraint unconstrained; static inline bool kernel_ip(unsigned long ip) { #ifdef CONFIG_X86_32 return ip > PAGE_OFFSET; #else return (long)ip < 0; #endif } /* * Not all PMUs provide the right context information to place the reported IP * into full context. Specifically segment registers are typically not * supplied. * * Assuming the address is a linear address (it is for IBS), we fake the CS and * vm86 mode using the known zero-based code segment and 'fix up' the registers * to reflect this. * * Intel PEBS/LBR appear to typically provide the effective address, nothing * much we can do about that but pray and treat it like a linear address. */ static inline void set_linear_ip(struct pt_regs *regs, unsigned long ip) { regs->cs = kernel_ip(ip) ? __KERNEL_CS : __USER_CS; if (regs->flags & X86_VM_MASK) regs->flags ^= (PERF_EFLAGS_VM | X86_VM_MASK); regs->ip = ip; } ssize_t x86_event_sysfs_show(char *page, u64 config, u64 event); ssize_t intel_event_sysfs_show(char *page, u64 config); ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); ssize_t events_ht_sysfs_show(struct device *dev, struct device_attribute *attr, char *page); #ifdef CONFIG_CPU_SUP_AMD int amd_pmu_init(void); #else /* CONFIG_CPU_SUP_AMD */ static inline int amd_pmu_init(void) { return 0; } #endif /* CONFIG_CPU_SUP_AMD */ static inline int is_pebs_pt(struct perf_event *event) { return !!(event->hw.flags & PERF_X86_EVENT_PEBS_VIA_PT); } #ifdef CONFIG_CPU_SUP_INTEL static inline bool intel_pmu_has_bts_period(struct perf_event *event, u64 period) { struct hw_perf_event *hwc = &event->hw; unsigned int hw_event, bts_event; if (event->attr.freq) return false; hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); return hw_event == bts_event && period == 1; } static inline bool intel_pmu_has_bts(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; return intel_pmu_has_bts_period(event, hwc->sample_period); } int intel_pmu_save_and_restart(struct perf_event *event); struct event_constraint * x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event); extern int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu); extern void intel_cpuc_finish(struct cpu_hw_events *cpuc); int intel_pmu_init(void); void init_debug_store_on_cpu(int cpu); void fini_debug_store_on_cpu(int cpu); void release_ds_buffers(void); void reserve_ds_buffers(void); void release_lbr_buffers(void); void reserve_lbr_buffers(void); extern struct event_constraint bts_constraint; extern struct event_constraint vlbr_constraint; void intel_pmu_enable_bts(u64 config); void intel_pmu_disable_bts(void); int intel_pmu_drain_bts_buffer(void); extern struct event_constraint intel_core2_pebs_event_constraints[]; extern struct event_constraint intel_atom_pebs_event_constraints[]; extern struct event_constraint intel_slm_pebs_event_constraints[]; extern struct event_constraint intel_glm_pebs_event_constraints[]; extern struct event_constraint intel_glp_pebs_event_constraints[]; extern struct event_constraint intel_nehalem_pebs_event_constraints[]; extern struct event_constraint intel_westmere_pebs_event_constraints[]; extern struct event_constraint intel_snb_pebs_event_constraints[]; extern struct event_constraint intel_ivb_pebs_event_constraints[]; extern struct event_constraint intel_hsw_pebs_event_constraints[]; extern struct event_constraint intel_bdw_pebs_event_constraints[]; extern struct event_constraint intel_skl_pebs_event_constraints[]; extern struct event_constraint intel_icl_pebs_event_constraints[]; struct event_constraint *intel_pebs_constraints(struct perf_event *event); void intel_pmu_pebs_add(struct perf_event *event); void intel_pmu_pebs_del(struct perf_event *event); void intel_pmu_pebs_enable(struct perf_event *event); void intel_pmu_pebs_disable(struct perf_event *event); void intel_pmu_pebs_enable_all(void); void intel_pmu_pebs_disable_all(void); void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); void intel_pmu_auto_reload_read(struct perf_event *event); void intel_pmu_store_pebs_lbrs(struct lbr_entry *lbr); void intel_ds_init(void); void intel_pmu_lbr_swap_task_ctx(struct perf_event_context *prev, struct perf_event_context *next); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); void intel_pmu_lbr_reset_32(void); void intel_pmu_lbr_reset_64(void); void intel_pmu_lbr_add(struct perf_event *event); void intel_pmu_lbr_del(struct perf_event *event); void intel_pmu_lbr_enable_all(bool pmi); void intel_pmu_lbr_disable_all(void); void intel_pmu_lbr_read(void); void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc); void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc); void intel_pmu_lbr_save(void *ctx); void intel_pmu_lbr_restore(void *ctx); void intel_pmu_lbr_init_core(void); void intel_pmu_lbr_init_nhm(void); void intel_pmu_lbr_init_atom(void); void intel_pmu_lbr_init_slm(void); void intel_pmu_lbr_init_snb(void); void intel_pmu_lbr_init_hsw(void); void intel_pmu_lbr_init_skl(void); void intel_pmu_lbr_init_knl(void); void intel_pmu_arch_lbr_init(void); void intel_pmu_pebs_data_source_nhm(void); void intel_pmu_pebs_data_source_skl(bool pmem); int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); int intel_bts_interrupt(void); void intel_bts_enable_local(void); void intel_bts_disable_local(void); int p4_pmu_init(void); int p6_pmu_init(void); int knc_pmu_init(void); static inline int is_ht_workaround_enabled(void) { return !!(x86_pmu.flags & PMU_FL_EXCL_ENABLED); } #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) { } static inline void release_ds_buffers(void) { } static inline void release_lbr_buffers(void) { } static inline void reserve_lbr_buffers(void) { } static inline int intel_pmu_init(void) { return 0; } static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { return 0; } static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc) { } static inline int is_ht_workaround_enabled(void) { return 0; } #endif /* CONFIG_CPU_SUP_INTEL */ #if ((defined CONFIG_CPU_SUP_CENTAUR) || (defined CONFIG_CPU_SUP_ZHAOXIN)) int zhaoxin_pmu_init(void); #else static inline int zhaoxin_pmu_init(void) { return 0; } #endif /*CONFIG_CPU_SUP_CENTAUR or CONFIG_CPU_SUP_ZHAOXIN*/
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. NET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Ethernet handlers. * * Version: @(#)eth.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Relocated to include/linux where it belongs by Alan Cox * <gw4pts@gw4pts.ampr.org> */ #ifndef _LINUX_ETHERDEVICE_H #define _LINUX_ETHERDEVICE_H #include <linux/if_ether.h> #include <linux/netdevice.h> #include <linux/random.h> #include <linux/crc32.h> #include <asm/unaligned.h> #include <asm/bitsperlong.h> #ifdef __KERNEL__ struct device; int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr); unsigned char *arch_get_platform_mac_address(void); int nvmem_get_mac_address(struct device *dev, void *addrbuf); u32 eth_get_headlen(const struct net_device *dev, void *data, unsigned int len); __be16 eth_type_trans(struct sk_buff *skb, struct net_device *dev); extern const struct header_ops eth_header_ops; int eth_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, const void *daddr, const void *saddr, unsigned len); int eth_header_parse(const struct sk_buff *skb, unsigned char *haddr); int eth_header_cache(const struct neighbour *neigh, struct hh_cache *hh, __be16 type); void eth_header_cache_update(struct hh_cache *hh, const struct net_device *dev, const unsigned char *haddr); __be16 eth_header_parse_protocol(const struct sk_buff *skb); int eth_prepare_mac_addr_change(struct net_device *dev, void *p); void eth_commit_mac_addr_change(struct net_device *dev, void *p); int eth_mac_addr(struct net_device *dev, void *p); int eth_validate_addr(struct net_device *dev); struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define alloc_etherdev(sizeof_priv) alloc_etherdev_mq(sizeof_priv, 1) #define alloc_etherdev_mq(sizeof_priv, count) alloc_etherdev_mqs(sizeof_priv, count, count) struct net_device *devm_alloc_etherdev_mqs(struct device *dev, int sizeof_priv, unsigned int txqs, unsigned int rxqs); #define devm_alloc_etherdev(dev, sizeof_priv) devm_alloc_etherdev_mqs(dev, sizeof_priv, 1, 1) struct sk_buff *eth_gro_receive(struct list_head *head, struct sk_buff *skb); int eth_gro_complete(struct sk_buff *skb, int nhoff); /* Reserved Ethernet Addresses per IEEE 802.1Q */ static const u8 eth_reserved_addr_base[ETH_ALEN] __aligned(2) = { 0x01, 0x80, 0xc2, 0x00, 0x00, 0x00 }; #define eth_stp_addr eth_reserved_addr_base /** * is_link_local_ether_addr - Determine if given Ethernet address is link-local * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if address is link local reserved addr (01:80:c2:00:00:0X) per * IEEE 802.1Q 8.6.3 Frame filtering. * * Please note: addr must be aligned to u16. */ static inline bool is_link_local_ether_addr(const u8 *addr) { __be16 *a = (__be16 *)addr; static const __be16 *b = (const __be16 *)eth_reserved_addr_base; static const __be16 m = cpu_to_be16(0xfff0); #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return (((*(const u32 *)addr) ^ (*(const u32 *)b)) | (__force int)((a[2] ^ b[2]) & m)) == 0; #else return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | ((a[2] ^ b[2]) & m)) == 0; #endif } /** * is_zero_ether_addr - Determine if give Ethernet address is all zeros. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is all zeroes. * * Please note: addr must be aligned to u16. */ static inline bool is_zero_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ((*(const u32 *)addr) | (*(const u16 *)(addr + 4))) == 0; #else return (*(const u16 *)(addr + 0) | *(const u16 *)(addr + 2) | *(const u16 *)(addr + 4)) == 0; #endif } /** * is_multicast_ether_addr - Determine if the Ethernet address is a multicast. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a multicast address. * By definition the broadcast address is also a multicast address. */ static inline bool is_multicast_ether_addr(const u8 *addr) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 a = *(const u32 *)addr; #else u16 a = *(const u16 *)addr; #endif #ifdef __BIG_ENDIAN return 0x01 & (a >> ((sizeof(a) * 8) - 8)); #else return 0x01 & a; #endif } static inline bool is_multicast_ether_addr_64bits(const u8 addr[6+2]) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #ifdef __BIG_ENDIAN return 0x01 & ((*(const u64 *)addr) >> 56); #else return 0x01 & (*(const u64 *)addr); #endif #else return is_multicast_ether_addr(addr); #endif } /** * is_local_ether_addr - Determine if the Ethernet address is locally-assigned one (IEEE 802). * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a local address. */ static inline bool is_local_ether_addr(const u8 *addr) { return 0x02 & addr[0]; } /** * is_broadcast_ether_addr - Determine if the Ethernet address is broadcast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is the broadcast address. * * Please note: addr must be aligned to u16. */ static inline bool is_broadcast_ether_addr(const u8 *addr) { return (*(const u16 *)(addr + 0) & *(const u16 *)(addr + 2) & *(const u16 *)(addr + 4)) == 0xffff; } /** * is_unicast_ether_addr - Determine if the Ethernet address is unicast * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a unicast address. */ static inline bool is_unicast_ether_addr(const u8 *addr) { return !is_multicast_ether_addr(addr); } /** * is_valid_ether_addr - Determine if the given Ethernet address is valid * @addr: Pointer to a six-byte array containing the Ethernet address * * Check that the Ethernet address (MAC) is not 00:00:00:00:00:00, is not * a multicast address, and is not FF:FF:FF:FF:FF:FF. * * Return true if the address is valid. * * Please note: addr must be aligned to u16. */ static inline bool is_valid_ether_addr(const u8 *addr) { /* FF:FF:FF:FF:FF:FF is a multicast address so we don't need to * explicitly check for it here. */ return !is_multicast_ether_addr(addr) && !is_zero_ether_addr(addr); } /** * eth_proto_is_802_3 - Determine if a given Ethertype/length is a protocol * @proto: Ethertype/length value to be tested * * Check that the value from the Ethertype/length field is a valid Ethertype. * * Return true if the valid is an 802.3 supported Ethertype. */ static inline bool eth_proto_is_802_3(__be16 proto) { #ifndef __BIG_ENDIAN /* if CPU is little endian mask off bits representing LSB */ proto &= htons(0xFF00); #endif /* cast both to u16 and compare since LSB can be ignored */ return (__force u16)proto >= (__force u16)htons(ETH_P_802_3_MIN); } /** * eth_random_addr - Generate software assigned random Ethernet address * @addr: Pointer to a six-byte array containing the Ethernet address * * Generate a random Ethernet address (MAC) that is not multicast * and has the local assigned bit set. */ static inline void eth_random_addr(u8 *addr) { get_random_bytes(addr, ETH_ALEN); addr[0] &= 0xfe; /* clear multicast bit */ addr[0] |= 0x02; /* set local assignment bit (IEEE802) */ } #define random_ether_addr(addr) eth_random_addr(addr) /** * eth_broadcast_addr - Assign broadcast address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the broadcast address to the given address array. */ static inline void eth_broadcast_addr(u8 *addr) { memset(addr, 0xff, ETH_ALEN); } /** * eth_zero_addr - Assign zero address * @addr: Pointer to a six-byte array containing the Ethernet address * * Assign the zero address to the given address array. */ static inline void eth_zero_addr(u8 *addr) { memset(addr, 0x00, ETH_ALEN); } /** * eth_hw_addr_random - Generate software assigned random Ethernet and * set device flag * @dev: pointer to net_device structure * * Generate a random Ethernet address (MAC) to be used by a net device * and set addr_assign_type so the state can be read by sysfs and be * used by userspace. */ static inline void eth_hw_addr_random(struct net_device *dev) { dev->addr_assign_type = NET_ADDR_RANDOM; eth_random_addr(dev->dev_addr); } /** * eth_hw_addr_crc - Calculate CRC from netdev_hw_addr * @ha: pointer to hardware address * * Calculate CRC from a hardware address as basis for filter hashes. */ static inline u32 eth_hw_addr_crc(struct netdev_hw_addr *ha) { return ether_crc(ETH_ALEN, ha->addr); } /** * ether_addr_copy - Copy an Ethernet address * @dst: Pointer to a six-byte array Ethernet address destination * @src: Pointer to a six-byte array Ethernet address source * * Please note: dst & src must both be aligned to u16. */ static inline void ether_addr_copy(u8 *dst, const u8 *src) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) *(u32 *)dst = *(const u32 *)src; *(u16 *)(dst + 4) = *(const u16 *)(src + 4); #else u16 *a = (u16 *)dst; const u16 *b = (const u16 *)src; a[0] = b[0]; a[1] = b[1]; a[2] = b[2]; #endif } /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to * @src: pointer to net_device to copy dev_addr from * * Copy the Ethernet address from one net_device to another along with * the address attributes (addr_assign_type). */ static inline void eth_hw_addr_inherit(struct net_device *dst, struct net_device *src) { dst->addr_assign_type = src->addr_assign_type; ether_addr_copy(dst->dev_addr, src->dev_addr); } /** * ether_addr_equal - Compare two Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: addr1 & addr2 must both be aligned to u16. */ static inline bool ether_addr_equal(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) u32 fold = ((*(const u32 *)addr1) ^ (*(const u32 *)addr2)) | ((*(const u16 *)(addr1 + 4)) ^ (*(const u16 *)(addr2 + 4))); return fold == 0; #else const u16 *a = (const u16 *)addr1; const u16 *b = (const u16 *)addr2; return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2])) == 0; #endif } /** * ether_addr_equal_64bits - Compare two Ethernet addresses * @addr1: Pointer to an array of 8 bytes * @addr2: Pointer to an other array of 8 bytes * * Compare two Ethernet addresses, returns true if equal, false otherwise. * * The function doesn't need any conditional branches and possibly uses * word memory accesses on CPU allowing cheap unaligned memory reads. * arrays = { byte1, byte2, byte3, byte4, byte5, byte6, pad1, pad2 } * * Please note that alignment of addr1 & addr2 are only guaranteed to be 16 bits. */ static inline bool ether_addr_equal_64bits(const u8 addr1[6+2], const u8 addr2[6+2]) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 u64 fold = (*(const u64 *)addr1) ^ (*(const u64 *)addr2); #ifdef __BIG_ENDIAN return (fold >> 16) == 0; #else return (fold << 16) == 0; #endif #else return ether_addr_equal(addr1, addr2); #endif } /** * ether_addr_equal_unaligned - Compare two not u16 aligned Ethernet addresses * @addr1: Pointer to a six-byte array containing the Ethernet address * @addr2: Pointer other six-byte array containing the Ethernet address * * Compare two Ethernet addresses, returns true if equal * * Please note: Use only when any Ethernet address may not be u16 aligned. */ static inline bool ether_addr_equal_unaligned(const u8 *addr1, const u8 *addr2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return ether_addr_equal(addr1, addr2); #else return memcmp(addr1, addr2, ETH_ALEN) == 0; #endif } /** * ether_addr_equal_masked - Compare two Ethernet addresses with a mask * @addr1: Pointer to a six-byte array containing the 1st Ethernet address * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address * @mask: Pointer to a six-byte array containing the Ethernet address bitmask * * Compare two Ethernet addresses with a mask, returns true if for every bit * set in the bitmask the equivalent bits in the ethernet addresses are equal. * Using a mask with all bits set is a slower ether_addr_equal. */ static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2, const u8 *mask) { int i; for (i = 0; i < ETH_ALEN; i++) { if ((addr1[i] ^ addr2[i]) & mask[i]) return false; } return true; } /** * ether_addr_to_u64 - Convert an Ethernet address into a u64 value. * @addr: Pointer to a six-byte array containing the Ethernet address * * Return a u64 value of the address */ static inline u64 ether_addr_to_u64(const u8 *addr) { u64 u = 0; int i; for (i = 0; i < ETH_ALEN; i++) u = u << 8 | addr[i]; return u; } /** * u64_to_ether_addr - Convert a u64 to an Ethernet address. * @u: u64 to convert to an Ethernet MAC address * @addr: Pointer to a six-byte array to contain the Ethernet address */ static inline void u64_to_ether_addr(u64 u, u8 *addr) { int i; for (i = ETH_ALEN - 1; i >= 0; i--) { addr[i] = u & 0xff; u = u >> 8; } } /** * eth_addr_dec - Decrement the given MAC address * * @addr: Pointer to a six-byte array containing Ethernet address to decrement */ static inline void eth_addr_dec(u8 *addr) { u64 u = ether_addr_to_u64(addr); u--; u64_to_ether_addr(u, addr); } /** * eth_addr_inc() - Increment the given MAC address. * @addr: Pointer to a six-byte array containing Ethernet address to increment. */ static inline void eth_addr_inc(u8 *addr) { u64 u = ether_addr_to_u64(addr); u++; u64_to_ether_addr(u, addr); } /** * is_etherdev_addr - Tell if given Ethernet address belongs to the device. * @dev: Pointer to a device structure * @addr: Pointer to a six-byte array containing the Ethernet address * * Compare passed address with all addresses of the device. Return true if the * address if one of the device addresses. * * Note that this function calls ether_addr_equal_64bits() so take care of * the right padding. */ static inline bool is_etherdev_addr(const struct net_device *dev, const u8 addr[6 + 2]) { struct netdev_hw_addr *ha; bool res = false; rcu_read_lock(); for_each_dev_addr(dev, ha) { res = ether_addr_equal_64bits(addr, ha->addr); if (res) break; } rcu_read_unlock(); return res; } #endif /* __KERNEL__ */ /** * compare_ether_header - Compare two Ethernet headers * @a: Pointer to Ethernet header * @b: Pointer to Ethernet header * * Compare two Ethernet headers, returns 0 if equal. * This assumes that the network header (i.e., IP header) is 4-byte * aligned OR the platform can handle unaligned access. This is the * case for all packets coming into netif_receive_skb or similar * entry points. */ static inline unsigned long compare_ether_header(const void *a, const void *b) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 unsigned long fold; /* * We want to compare 14 bytes: * [a0 ... a13] ^ [b0 ... b13] * Use two long XOR, ORed together, with an overlap of two bytes. * [a0 a1 a2 a3 a4 a5 a6 a7 ] ^ [b0 b1 b2 b3 b4 b5 b6 b7 ] | * [a6 a7 a8 a9 a10 a11 a12 a13] ^ [b6 b7 b8 b9 b10 b11 b12 b13] * This means the [a6 a7] ^ [b6 b7] part is done two times. */ fold = *(unsigned long *)a ^ *(unsigned long *)b; fold |= *(unsigned long *)(a + 6) ^ *(unsigned long *)(b + 6); return fold; #else u32 *a32 = (u32 *)((u8 *)a + 2); u32 *b32 = (u32 *)((u8 *)b + 2); return (*(u16 *)a ^ *(u16 *)b) | (a32[0] ^ b32[0]) | (a32[1] ^ b32[1]) | (a32[2] ^ b32[2]); #endif } /** * eth_skb_pad - Pad buffer to mininum number of octets for Ethernet frame * @skb: Buffer to pad * * An Ethernet frame should have a minimum size of 60 bytes. This function * takes short frames and pads them with zeros up to the 60 byte limit. */ static inline int eth_skb_pad(struct sk_buff *skb) { return skb_put_padto(skb, ETH_ZLEN); } #endif /* _LINUX_ETHERDEVICE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 /* SPDX-License-Identifier: GPL-2.0 */ /* * ioport.h Definitions of routines for detecting, reserving and * allocating system resources. * * Authors: Linus Torvalds */ #ifndef _LINUX_IOPORT_H #define _LINUX_IOPORT_H #ifndef __ASSEMBLY__ #include <linux/compiler.h> #include <linux/types.h> #include <linux/bits.h> /* * Resources are tree-like, allowing * nesting etc.. */ struct resource { resource_size_t start; resource_size_t end; const char *name; unsigned long flags; unsigned long desc; struct resource *parent, *sibling, *child; }; /* * IO resources have these defined flags. * * PCI devices expose these flags to userspace in the "resource" sysfs file, * so don't move them. */ #define IORESOURCE_BITS 0x000000ff /* Bus-specific bits */ #define IORESOURCE_TYPE_BITS 0x00001f00 /* Resource type */ #define IORESOURCE_IO 0x00000100 /* PCI/ISA I/O ports */ #define IORESOURCE_MEM 0x00000200 #define IORESOURCE_REG 0x00000300 /* Register offsets */ #define IORESOURCE_IRQ 0x00000400 #define IORESOURCE_DMA 0x00000800 #define IORESOURCE_BUS 0x00001000 #define IORESOURCE_PREFETCH 0x00002000 /* No side effects */ #define IORESOURCE_READONLY 0x00004000 #define IORESOURCE_CACHEABLE 0x00008000 #define IORESOURCE_RANGELENGTH 0x00010000 #define IORESOURCE_SHADOWABLE 0x00020000 #define IORESOURCE_SIZEALIGN 0x00040000 /* size indicates alignment */ #define IORESOURCE_STARTALIGN 0x00080000 /* start field is alignment */ #define IORESOURCE_MEM_64 0x00100000 #define IORESOURCE_WINDOW 0x00200000 /* forwarded by bridge */ #define IORESOURCE_MUXED 0x00400000 /* Resource is software muxed */ #define IORESOURCE_EXT_TYPE_BITS 0x01000000 /* Resource extended types */ #define IORESOURCE_SYSRAM 0x01000000 /* System RAM (modifier) */ /* IORESOURCE_SYSRAM specific bits. */ #define IORESOURCE_SYSRAM_DRIVER_MANAGED 0x02000000 /* Always detected via a driver. */ #define IORESOURCE_SYSRAM_MERGEABLE 0x04000000 /* Resource can be merged. */ #define IORESOURCE_EXCLUSIVE 0x08000000 /* Userland may not map this resource */ #define IORESOURCE_DISABLED 0x10000000 #define IORESOURCE_UNSET 0x20000000 /* No address assigned yet */ #define IORESOURCE_AUTO 0x40000000 #define IORESOURCE_BUSY 0x80000000 /* Driver has marked this resource busy */ /* I/O resource extended types */ #define IORESOURCE_SYSTEM_RAM (IORESOURCE_MEM|IORESOURCE_SYSRAM) /* PnP IRQ specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IRQ_HIGHEDGE (1<<0) #define IORESOURCE_IRQ_LOWEDGE (1<<1) #define IORESOURCE_IRQ_HIGHLEVEL (1<<2) #define IORESOURCE_IRQ_LOWLEVEL (1<<3) #define IORESOURCE_IRQ_SHAREABLE (1<<4) #define IORESOURCE_IRQ_OPTIONAL (1<<5) /* PnP DMA specific bits (IORESOURCE_BITS) */ #define IORESOURCE_DMA_TYPE_MASK (3<<0) #define IORESOURCE_DMA_8BIT (0<<0) #define IORESOURCE_DMA_8AND16BIT (1<<0) #define IORESOURCE_DMA_16BIT (2<<0) #define IORESOURCE_DMA_MASTER (1<<2) #define IORESOURCE_DMA_BYTE (1<<3) #define IORESOURCE_DMA_WORD (1<<4) #define IORESOURCE_DMA_SPEED_MASK (3<<6) #define IORESOURCE_DMA_COMPATIBLE (0<<6) #define IORESOURCE_DMA_TYPEA (1<<6) #define IORESOURCE_DMA_TYPEB (2<<6) #define IORESOURCE_DMA_TYPEF (3<<6) /* PnP memory I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_MEM_WRITEABLE (1<<0) /* dup: IORESOURCE_READONLY */ #define IORESOURCE_MEM_CACHEABLE (1<<1) /* dup: IORESOURCE_CACHEABLE */ #define IORESOURCE_MEM_RANGELENGTH (1<<2) /* dup: IORESOURCE_RANGELENGTH */ #define IORESOURCE_MEM_TYPE_MASK (3<<3) #define IORESOURCE_MEM_8BIT (0<<3) #define IORESOURCE_MEM_16BIT (1<<3) #define IORESOURCE_MEM_8AND16BIT (2<<3) #define IORESOURCE_MEM_32BIT (3<<3) #define IORESOURCE_MEM_SHADOWABLE (1<<5) /* dup: IORESOURCE_SHADOWABLE */ #define IORESOURCE_MEM_EXPANSIONROM (1<<6) /* PnP I/O specific bits (IORESOURCE_BITS) */ #define IORESOURCE_IO_16BIT_ADDR (1<<0) #define IORESOURCE_IO_FIXED (1<<1) #define IORESOURCE_IO_SPARSE (1<<2) /* PCI ROM control bits (IORESOURCE_BITS) */ #define IORESOURCE_ROM_ENABLE (1<<0) /* ROM is enabled, same as PCI_ROM_ADDRESS_ENABLE */ #define IORESOURCE_ROM_SHADOW (1<<1) /* Use RAM image, not ROM BAR */ /* PCI control bits. Shares IORESOURCE_BITS with above PCI ROM. */ #define IORESOURCE_PCI_FIXED (1<<4) /* Do not move resource */ #define IORESOURCE_PCI_EA_BEI (1<<5) /* BAR Equivalent Indicator */ /* * I/O Resource Descriptors * * Descriptors are used by walk_iomem_res_desc() and region_intersects() * for searching a specific resource range in the iomem table. Assign * a new descriptor when a resource range supports the search interfaces. * Otherwise, resource.desc must be set to IORES_DESC_NONE (0). */ enum { IORES_DESC_NONE = 0, IORES_DESC_CRASH_KERNEL = 1, IORES_DESC_ACPI_TABLES = 2, IORES_DESC_ACPI_NV_STORAGE = 3, IORES_DESC_PERSISTENT_MEMORY = 4, IORES_DESC_PERSISTENT_MEMORY_LEGACY = 5, IORES_DESC_DEVICE_PRIVATE_MEMORY = 6, IORES_DESC_RESERVED = 7, IORES_DESC_SOFT_RESERVED = 8, }; /* * Flags controlling ioremap() behavior. */ enum { IORES_MAP_SYSTEM_RAM = BIT(0), IORES_MAP_ENCRYPTED = BIT(1), }; /* helpers to define resources */ #define DEFINE_RES_NAMED(_start, _size, _name, _flags) \ { \ .start = (_start), \ .end = (_start) + (_size) - 1, \ .name = (_name), \ .flags = (_flags), \ .desc = IORES_DESC_NONE, \ } #define DEFINE_RES_IO_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_IO) #define DEFINE_RES_IO(_start, _size) \ DEFINE_RES_IO_NAMED((_start), (_size), NULL) #define DEFINE_RES_MEM_NAMED(_start, _size, _name) \ DEFINE_RES_NAMED((_start), (_size), (_name), IORESOURCE_MEM) #define DEFINE_RES_MEM(_start, _size) \ DEFINE_RES_MEM_NAMED((_start), (_size), NULL) #define DEFINE_RES_IRQ_NAMED(_irq, _name) \ DEFINE_RES_NAMED((_irq), 1, (_name), IORESOURCE_IRQ) #define DEFINE_RES_IRQ(_irq) \ DEFINE_RES_IRQ_NAMED((_irq), NULL) #define DEFINE_RES_DMA_NAMED(_dma, _name) \ DEFINE_RES_NAMED((_dma), 1, (_name), IORESOURCE_DMA) #define DEFINE_RES_DMA(_dma) \ DEFINE_RES_DMA_NAMED((_dma), NULL) /* PC/ISA/whatever - the normal PC address spaces: IO and memory */ extern struct resource ioport_resource; extern struct resource iomem_resource; extern struct resource *request_resource_conflict(struct resource *root, struct resource *new); extern int request_resource(struct resource *root, struct resource *new); extern int release_resource(struct resource *new); void release_child_resources(struct resource *new); extern void reserve_region_with_split(struct resource *root, resource_size_t start, resource_size_t end, const char *name); extern struct resource *insert_resource_conflict(struct resource *parent, struct resource *new); extern int insert_resource(struct resource *parent, struct resource *new); extern void insert_resource_expand_to_fit(struct resource *root, struct resource *new); extern int remove_resource(struct resource *old); extern void arch_remove_reservations(struct resource *avail); extern int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, resource_size_t (*alignf)(void *, const struct resource *, resource_size_t, resource_size_t), void *alignf_data); struct resource *lookup_resource(struct resource *root, resource_size_t start); int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size); resource_size_t resource_alignment(struct resource *res); static inline resource_size_t resource_size(const struct resource *res) { return res->end - res->start + 1; } static inline unsigned long resource_type(const struct resource *res) { return res->flags & IORESOURCE_TYPE_BITS; } static inline unsigned long resource_ext_type(const struct resource *res) { return res->flags & IORESOURCE_EXT_TYPE_BITS; } /* True iff r1 completely contains r2 */ static inline bool resource_contains(struct resource *r1, struct resource *r2) { if (resource_type(r1) != resource_type(r2)) return false; if (r1->flags & IORESOURCE_UNSET || r2->flags & IORESOURCE_UNSET) return false; return r1->start <= r2->start && r1->end >= r2->end; } /* Convenience shorthand with allocation */ #define request_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), 0) #define request_muxed_region(start,n,name) __request_region(&ioport_resource, (start), (n), (name), IORESOURCE_MUXED) #define __request_mem_region(start,n,name, excl) __request_region(&iomem_resource, (start), (n), (name), excl) #define request_mem_region(start,n,name) __request_region(&iomem_resource, (start), (n), (name), 0) #define request_mem_region_exclusive(start,n,name) \ __request_region(&iomem_resource, (start), (n), (name), IORESOURCE_EXCLUSIVE) #define rename_region(region, newname) do { (region)->name = (newname); } while (0) extern struct resource * __request_region(struct resource *, resource_size_t start, resource_size_t n, const char *name, int flags); /* Compatibility cruft */ #define release_region(start,n) __release_region(&ioport_resource, (start), (n)) #define release_mem_region(start,n) __release_region(&iomem_resource, (start), (n)) extern void __release_region(struct resource *, resource_size_t, resource_size_t); #ifdef CONFIG_MEMORY_HOTREMOVE extern void release_mem_region_adjustable(resource_size_t, resource_size_t); #endif #ifdef CONFIG_MEMORY_HOTPLUG extern void merge_system_ram_resource(struct resource *res); #endif /* Wrappers for managed devices */ struct device; extern int devm_request_resource(struct device *dev, struct resource *root, struct resource *new); extern void devm_release_resource(struct device *dev, struct resource *new); #define devm_request_region(dev,start,n,name) \ __devm_request_region(dev, &ioport_resource, (start), (n), (name)) #define devm_request_mem_region(dev,start,n,name) \ __devm_request_region(dev, &iomem_resource, (start), (n), (name)) extern struct resource * __devm_request_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n, const char *name); #define devm_release_region(dev, start, n) \ __devm_release_region(dev, &ioport_resource, (start), (n)) #define devm_release_mem_region(dev, start, n) \ __devm_release_region(dev, &iomem_resource, (start), (n)) extern void __devm_release_region(struct device *dev, struct resource *parent, resource_size_t start, resource_size_t n); extern int iomem_map_sanity_check(resource_size_t addr, unsigned long size); extern bool iomem_is_exclusive(u64 addr); extern int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)); extern int walk_mem_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); extern int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, u64 end, void *arg, int (*func)(struct resource *, void *)); /* True if any part of r1 overlaps r2 */ static inline bool resource_overlaps(struct resource *r1, struct resource *r2) { return (r1->start <= r2->end && r1->end >= r2->start); } struct resource *devm_request_free_mem_region(struct device *dev, struct resource *base, unsigned long size); struct resource *request_free_mem_region(struct resource *base, unsigned long size, const char *name); #ifdef CONFIG_IO_STRICT_DEVMEM void revoke_devmem(struct resource *res); #else static inline void revoke_devmem(struct resource *res) { }; #endif #endif /* __ASSEMBLY__ */ #endif /* _LINUX_IOPORT_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 /* SPDX-License-Identifier: GPL-2.0 */ /* thread_info.h: common low-level thread information accessors * * Copyright (C) 2002 David Howells (dhowells@redhat.com) * - Incorporating suggestions made by Linus Torvalds */ #ifndef _LINUX_THREAD_INFO_H #define _LINUX_THREAD_INFO_H #include <linux/types.h> #include <linux/bug.h> #include <linux/restart_block.h> #include <linux/errno.h> #ifdef CONFIG_THREAD_INFO_IN_TASK /* * For CONFIG_THREAD_INFO_IN_TASK kernels we need <asm/current.h> for the * definition of current, but for !CONFIG_THREAD_INFO_IN_TASK kernels, * including <asm/current.h> can cause a circular dependency on some platforms. */ #include <asm/current.h> #define current_thread_info() ((struct thread_info *)current) #endif #include <linux/bitops.h> /* * For per-arch arch_within_stack_frames() implementations, defined in * asm/thread_info.h. */ enum { BAD_STACK = -1, NOT_STACK = 0, GOOD_FRAME, GOOD_STACK, }; #include <asm/thread_info.h> #ifdef __KERNEL__ #ifndef arch_set_restart_data #define arch_set_restart_data(restart) do { } while (0) #endif static inline long set_restart_fn(struct restart_block *restart, long (*fn)(struct restart_block *)) { restart->fn = fn; arch_set_restart_data(restart); return -ERESTART_RESTARTBLOCK; } #ifndef THREAD_ALIGN #define THREAD_ALIGN THREAD_SIZE #endif #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions */ static inline void set_ti_thread_flag(struct thread_info *ti, int flag) { set_bit(flag, (unsigned long *)&ti->flags); } static inline void clear_ti_thread_flag(struct thread_info *ti, int flag) { clear_bit(flag, (unsigned long *)&ti->flags); } static inline void update_ti_thread_flag(struct thread_info *ti, int flag, bool value) { if (value) set_ti_thread_flag(ti, flag); else clear_ti_thread_flag(ti, flag); } static inline int test_and_set_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_set_bit(flag, (unsigned long *)&ti->flags); } static inline int test_and_clear_ti_thread_flag(struct thread_info *ti, int flag) { return test_and_clear_bit(flag, (unsigned long *)&ti->flags); } static inline int test_ti_thread_flag(struct thread_info *ti, int flag) { return test_bit(flag, (unsigned long *)&ti->flags); } #define set_thread_flag(flag) \ set_ti_thread_flag(current_thread_info(), flag) #define clear_thread_flag(flag) \ clear_ti_thread_flag(current_thread_info(), flag) #define update_thread_flag(flag, value) \ update_ti_thread_flag(current_thread_info(), flag, value) #define test_and_set_thread_flag(flag) \ test_and_set_ti_thread_flag(current_thread_info(), flag) #define test_and_clear_thread_flag(flag) \ test_and_clear_ti_thread_flag(current_thread_info(), flag) #define test_thread_flag(flag) \ test_ti_thread_flag(current_thread_info(), flag) #define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, const void * const stackend, const void *obj, unsigned long len) { return 0; } #endif #ifdef CONFIG_HARDENED_USERCOPY extern void __check_object_size(const void *ptr, unsigned long n, bool to_user); static __always_inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { if (!__builtin_constant_p(n)) __check_object_size(ptr, n, to_user); } #else static inline void check_object_size(const void *ptr, unsigned long n, bool to_user) { } #endif /* CONFIG_HARDENED_USERCOPY */ extern void __compiletime_error("copy source size is too small") __bad_copy_from(void); extern void __compiletime_error("copy destination size is too small") __bad_copy_to(void); static inline void copy_overflow(int size, unsigned long count) { WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } static __always_inline __must_check bool check_copy_size(const void *addr, size_t bytes, bool is_source) { int sz = __compiletime_object_size(addr); if (unlikely(sz >= 0 && sz < bytes)) { if (!__builtin_constant_p(bytes)) copy_overflow(sz, bytes); else if (is_source) __bad_copy_from(); else __bad_copy_to(); return false; } if (WARN_ON_ONCE(bytes > INT_MAX)) return false; check_object_size(addr, bytes, is_source); return true; } #ifndef arch_setup_new_exec static inline void arch_setup_new_exec(void) { } #endif #endif /* __KERNEL__ */ #endif /* _LINUX_THREAD_INFO_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 /* SPDX-License-Identifier: GPL-2.0-only */ /* * async.h: Asynchronous function calls for boot performance * * (C) Copyright 2009 Intel Corporation * Author: Arjan van de Ven <arjan@linux.intel.com> */ #ifndef __ASYNC_H__ #define __ASYNC_H__ #include <linux/types.h> #include <linux/list.h> #include <linux/numa.h> #include <linux/device.h> typedef u64 async_cookie_t; typedef void (*async_func_t) (void *data, async_cookie_t cookie); struct async_domain { struct list_head pending; unsigned registered:1; }; /* * domain participates in global async_synchronize_full */ #define ASYNC_DOMAIN(_name) \ struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 1 } /* * domain is free to go out of scope as soon as all pending work is * complete, this domain does not participate in async_synchronize_full */ #define ASYNC_DOMAIN_EXCLUSIVE(_name) \ struct async_domain _name = { .pending = LIST_HEAD_INIT(_name.pending), \ .registered = 0 } async_cookie_t async_schedule_node(async_func_t func, void *data, int node); async_cookie_t async_schedule_node_domain(async_func_t func, void *data, int node, struct async_domain *domain); /** * async_schedule - schedule a function for asynchronous execution * @func: function to execute asynchronously * @data: data pointer to pass to the function * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule(async_func_t func, void *data) { return async_schedule_node(func, data, NUMA_NO_NODE); } /** * async_schedule_domain - schedule a function for asynchronous execution within a certain domain * @func: function to execute asynchronously * @data: data pointer to pass to the function * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. * @domain may be used in the async_synchronize_*_domain() functions to * wait within a certain synchronization domain rather than globally. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain) { return async_schedule_node_domain(func, data, NUMA_NO_NODE, domain); } /** * async_schedule_dev - A device specific version of async_schedule * @func: function to execute asynchronously * @dev: device argument to be passed to function * * Returns an async_cookie_t that may be used for checkpointing later. * @dev is used as both the argument for the function and to provide NUMA * context for where to run the function. By doing this we can try to * provide for the best possible outcome by operating on the device on the * CPUs closest to the device. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule_dev(async_func_t func, struct device *dev) { return async_schedule_node(func, dev, dev_to_node(dev)); } /** * async_schedule_dev_domain - A device specific version of async_schedule_domain * @func: function to execute asynchronously * @dev: device argument to be passed to function * @domain: the domain * * Returns an async_cookie_t that may be used for checkpointing later. * @dev is used as both the argument for the function and to provide NUMA * context for where to run the function. By doing this we can try to * provide for the best possible outcome by operating on the device on the * CPUs closest to the device. * @domain may be used in the async_synchronize_*_domain() functions to * wait within a certain synchronization domain rather than globally. * Note: This function may be called from atomic or non-atomic contexts. */ static inline async_cookie_t async_schedule_dev_domain(async_func_t func, struct device *dev, struct async_domain *domain) { return async_schedule_node_domain(func, dev, dev_to_node(dev), domain); } void async_unregister_domain(struct async_domain *domain); extern void async_synchronize_full(void); extern void async_synchronize_full_domain(struct async_domain *domain); extern void async_synchronize_cookie(async_cookie_t cookie); extern void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain *domain); extern bool current_is_async(void); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef __LINUX_OVERFLOW_H #define __LINUX_OVERFLOW_H #include <linux/compiler.h> #include <linux/limits.h> /* * In the fallback code below, we need to compute the minimum and * maximum values representable in a given type. These macros may also * be useful elsewhere, so we provide them outside the * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. * * It would seem more obvious to do something like * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) * * Unfortunately, the middle expressions, strictly speaking, have * undefined behaviour, and at least some versions of gcc warn about * the type_max expression (but not if -fsanitize=undefined is in * effect; in that case, the warning is deferred to runtime...). * * The slightly excessive casting in type_min is to make sure the * macros also produce sensible values for the exotic type _Bool. [The * overflow checkers only almost work for _Bool, but that's * a-feature-not-a-bug, since people shouldn't be doing arithmetic on * _Bools. Besides, the gcc builtins don't allow _Bool* as third * argument.] * * Idea stolen from * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ #define is_signed_type(type) (((type)(-1)) < (type)1) #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) /* * Avoids triggering -Wtype-limits compilation warning, * while using unsigned data types to check a < 0. */ #define is_non_negative(a) ((a) > 0 || (a) == 0) #define is_negative(a) (!(is_non_negative(a))) /* * Allows for effectively applying __must_check to a macro so we can have * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ static inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } #ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW /* * For simplicity and code hygiene, the fallback code below insists on * a, b and *d having the same type (similar to the min() and max() * macros), whereas gcc's type-generic overflow checkers accept * different types. Hence we don't just make check_add_overflow an * alias for __builtin_add_overflow, but add type checks similar to * below. */ #define check_add_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_add_overflow(__a, __b, __d); \ })) #define check_sub_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_sub_overflow(__a, __b, __d); \ })) #define check_mul_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_mul_overflow(__a, __b, __d); \ })) #else /* Checking for unsigned overflow is relatively easy without causing UB. */ #define __unsigned_add_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = __a + __b; \ *__d < __a; \ }) #define __unsigned_sub_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = __a - __b; \ __a < __b; \ }) /* * If one of a or b is a compile-time constant, this avoids a division. */ #define __unsigned_mul_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = __a * __b; \ __builtin_constant_p(__b) ? \ __b > 0 && __a > type_max(typeof(__a)) / __b : \ __a > 0 && __b > type_max(typeof(__b)) / __a; \ }) /* * For signed types, detecting overflow is much harder, especially if * we want to avoid UB. But the interface of these macros is such that * we must provide a result in *d, and in fact we must produce the * result promised by gcc's builtins, which is simply the possibly * wrapped-around value. Fortunately, we can just formally do the * operations in the widest relevant unsigned type (u64) and then * truncate the result - gcc is smart enough to generate the same code * with and without the (u64) casts. */ /* * Adding two signed integers can overflow only if they have the same * sign, and overflow has happened iff the result has the opposite * sign. */ #define __signed_add_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = (u64)__a + (u64)__b; \ (((~(__a ^ __b)) & (*__d ^ __a)) \ & type_min(typeof(__a))) != 0; \ }) /* * Subtraction is similar, except that overflow can now happen only * when the signs are opposite. In this case, overflow has happened if * the result has the opposite sign of a. */ #define __signed_sub_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = (u64)__a - (u64)__b; \ ((((__a ^ __b)) & (*__d ^ __a)) \ & type_min(typeof(__a))) != 0; \ }) /* * Signed multiplication is rather hard. gcc always follows C99, so * division is truncated towards 0. This means that we can write the * overflow check like this: * * (a > 0 && (b > MAX/a || b < MIN/a)) || * (a < -1 && (b > MIN/a || b < MAX/a) || * (a == -1 && b == MIN) * * The redundant casts of -1 are to silence an annoying -Wtype-limits * (included in -Wextra) warning: When the type is u8 or u16, the * __b_c_e in check_mul_overflow obviously selects * __unsigned_mul_overflow, but unfortunately gcc still parses this * code and warns about the limited range of __b. */ #define __signed_mul_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ typeof(a) __tmax = type_max(typeof(a)); \ typeof(a) __tmin = type_min(typeof(a)); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = (u64)__a * (u64)__b; \ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ (__b == (typeof(__b))-1 && __a == __tmin); \ }) #define check_add_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_add_overflow(a, b, d), \ __unsigned_add_overflow(a, b, d))) #define check_sub_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_sub_overflow(a, b, d), \ __unsigned_sub_overflow(a, b, d))) #define check_mul_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_mul_overflow(a, b, d), \ __unsigned_mul_overflow(a, b, d))) #endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ /** check_shl_overflow() - Calculate a left-shifted value and check overflow * * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * * Returns true if '*d' cannot hold the result or when 'a << s' doesn't * make sense. Example conditions: * - 'a << s' causes bits to be lost when stored in *d. * - 's' is garbage (e.g. negative) or so large that the result of * 'a << s' is guaranteed to be 0. * - 'a' is negative. * - 'a << s' sets the sign bit, if any, in '*d'. * * '*d' will hold the results of the attempted shift, but is not * considered "safe for use" if false is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ u64 _a_full = _a; \ unsigned int _to_shift = \ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ (*_d >> _to_shift) != _a); \ })) /** * array_size() - Calculate size of 2-dimensional array. * * @a: dimension one * @b: dimension two * * Calculates size of 2-dimensional array: @a * @b. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ static inline __must_check size_t array_size(size_t a, size_t b) { size_t bytes; if (check_mul_overflow(a, b, &bytes)) return SIZE_MAX; return bytes; } /** * array3_size() - Calculate size of 3-dimensional array. * * @a: dimension one * @b: dimension two * @c: dimension three * * Calculates size of 3-dimensional array: @a * @b * @c. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) { size_t bytes; if (check_mul_overflow(a, b, &bytes)) return SIZE_MAX; if (check_mul_overflow(bytes, c, &bytes)) return SIZE_MAX; return bytes; } /* * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for * struct_size() below. */ static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c) { size_t bytes; if (check_mul_overflow(a, b, &bytes)) return SIZE_MAX; if (check_add_overflow(bytes, c, &bytes)) return SIZE_MAX; return bytes; } /** * struct_size() - Calculate size of structure with trailing array. * @p: Pointer to the structure. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure @p followed by an * array of @count number of @member elements. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size(p, member, count) \ __ab_c_size(count, \ sizeof(*(p)->member) + __must_be_array((p)->member),\ sizeof(*(p))) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. * * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. * * Calculates size of a flexible array of @count number of @member * elements, at the end of structure @p. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define flex_array_size(p, member, count) \ array_size(count, \ sizeof(*(p)->member) + __must_be_array((p)->member)) #endif /* __LINUX_OVERFLOW_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MEMREMAP_H_ #define _LINUX_MEMREMAP_H_ #include <linux/range.h> #include <linux/ioport.h> #include <linux/percpu-refcount.h> struct resource; struct device; /** * struct vmem_altmap - pre-allocated storage for vmemmap_populate * @base_pfn: base of the entire dev_pagemap mapping * @reserve: pages mapped, but reserved for driver use (relative to @base) * @free: free pages set aside in the mapping for memmap storage * @align: pages reserved to meet allocation alignments * @alloc: track pages consumed, private to vmemmap_populate() */ struct vmem_altmap { const unsigned long base_pfn; const unsigned long end_pfn; const unsigned long reserve; unsigned long free; unsigned long align; unsigned long alloc; }; /* * Specialize ZONE_DEVICE memory into multiple types each having differents * usage. * * MEMORY_DEVICE_PRIVATE: * Device memory that is not directly addressable by the CPU: CPU can neither * read nor write private memory. In this case, we do still have struct pages * backing the device memory. Doing so simplifies the implementation, but it is * important to remember that there are certain points at which the struct page * must be treated as an opaque object, rather than a "normal" struct page. * * A more complete discussion of unaddressable memory may be found in * include/linux/hmm.h and Documentation/vm/hmm.rst. * * MEMORY_DEVICE_FS_DAX: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. In support of coordinating page * pinning vs other operations MEMORY_DEVICE_FS_DAX arranges for a * wakeup event whenever a page is unpinned and becomes idle. This * wakeup is used to coordinate physical address space management (ex: * fs truncate/hole punch) vs pinned pages (ex: device dma). * * MEMORY_DEVICE_GENERIC: * Host memory that has similar access semantics as System RAM i.e. DMA * coherent and supports page pinning. This is for example used by DAX devices * that expose memory using a character device. * * MEMORY_DEVICE_PCI_P2PDMA: * Device memory residing in a PCI BAR intended for use with Peer-to-Peer * transactions. */ enum memory_type { /* 0 is reserved to catch uninitialized type fields */ MEMORY_DEVICE_PRIVATE = 1, MEMORY_DEVICE_FS_DAX, MEMORY_DEVICE_GENERIC, MEMORY_DEVICE_PCI_P2PDMA, }; struct dev_pagemap_ops { /* * Called once the page refcount reaches 1. (ZONE_DEVICE pages never * reach 0 refcount unless there is a refcount bug. This allows the * device driver to implement its own memory management.) */ void (*page_free)(struct page *page); /* * Transition the refcount in struct dev_pagemap to the dead state. */ void (*kill)(struct dev_pagemap *pgmap); /* * Wait for refcount in struct dev_pagemap to be idle and reap it. */ void (*cleanup)(struct dev_pagemap *pgmap); /* * Used for private (un-addressable) device memory only. Must migrate * the page back to a CPU accessible page. */ vm_fault_t (*migrate_to_ram)(struct vm_fault *vmf); }; #define PGMAP_ALTMAP_VALID (1 << 0) /** * struct dev_pagemap - metadata for ZONE_DEVICE mappings * @altmap: pre-allocated/reserved memory for vmemmap allocations * @ref: reference count that pins the devm_memremap_pages() mapping * @internal_ref: internal reference if @ref is not provided by the caller * @done: completion for @internal_ref * @type: memory type: see MEMORY_* in memory_hotplug.h * @flags: PGMAP_* flags to specify defailed behavior * @ops: method table * @owner: an opaque pointer identifying the entity that manages this * instance. Used by various helpers to make sure that no * foreign ZONE_DEVICE memory is accessed. * @nr_range: number of ranges to be mapped * @range: range to be mapped when nr_range == 1 * @ranges: array of ranges to be mapped when nr_range > 1 */ struct dev_pagemap { struct vmem_altmap altmap; struct percpu_ref *ref; struct percpu_ref internal_ref; struct completion done; enum memory_type type; unsigned int flags; const struct dev_pagemap_ops *ops; void *owner; int nr_range; union { struct range range; struct range ranges[0]; }; }; static inline struct vmem_altmap *pgmap_altmap(struct dev_pagemap *pgmap) { if (pgmap->flags & PGMAP_ALTMAP_VALID) return &pgmap->altmap; return NULL; } #ifdef CONFIG_ZONE_DEVICE void *memremap_pages(struct dev_pagemap *pgmap, int nid); void memunmap_pages(struct dev_pagemap *pgmap); void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap); void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap); struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap); bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn); unsigned long vmem_altmap_offset(struct vmem_altmap *altmap); void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns); unsigned long memremap_compat_align(void); #else static inline void *devm_memremap_pages(struct device *dev, struct dev_pagemap *pgmap) { /* * Fail attempts to call devm_memremap_pages() without * ZONE_DEVICE support enabled, this requires callers to fall * back to plain devm_memremap() based on config */ WARN_ON_ONCE(1); return ERR_PTR(-ENXIO); } static inline void devm_memunmap_pages(struct device *dev, struct dev_pagemap *pgmap) { } static inline struct dev_pagemap *get_dev_pagemap(unsigned long pfn, struct dev_pagemap *pgmap) { return NULL; } static inline bool pgmap_pfn_valid(struct dev_pagemap *pgmap, unsigned long pfn) { return false; } static inline unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) { return 0; } static inline void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) { } /* when memremap_pages() is disabled all archs can remap a single page */ static inline unsigned long memremap_compat_align(void) { return PAGE_SIZE; } #endif /* CONFIG_ZONE_DEVICE */ static inline void put_dev_pagemap(struct dev_pagemap *pgmap) { if (pgmap) percpu_ref_put(pgmap->ref); } #endif /* _LINUX_MEMREMAP_H_ */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 // SPDX-License-Identifier: GPL-2.0 /* * Fast batching percpu counters. */ #include <linux/percpu_counter.h> #include <linux/mutex.h> #include <linux/init.h> #include <linux/cpu.h> #include <linux/module.h> #include <linux/debugobjects.h> #ifdef CONFIG_HOTPLUG_CPU static LIST_HEAD(percpu_counters); static DEFINE_SPINLOCK(percpu_counters_lock); #endif #ifdef CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER static const struct debug_obj_descr percpu_counter_debug_descr; static bool percpu_counter_fixup_free(void *addr, enum debug_obj_state state) { struct percpu_counter *fbc = addr; switch (state) { case ODEBUG_STATE_ACTIVE: percpu_counter_destroy(fbc); debug_object_free(fbc, &percpu_counter_debug_descr); return true; default: return false; } } static const struct debug_obj_descr percpu_counter_debug_descr = { .name = "percpu_counter", .fixup_free = percpu_counter_fixup_free, }; static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { debug_object_init(fbc, &percpu_counter_debug_descr); debug_object_activate(fbc, &percpu_counter_debug_descr); } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { debug_object_deactivate(fbc, &percpu_counter_debug_descr); debug_object_free(fbc, &percpu_counter_debug_descr); } #else /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ static inline void debug_percpu_counter_activate(struct percpu_counter *fbc) { } static inline void debug_percpu_counter_deactivate(struct percpu_counter *fbc) { } #endif /* CONFIG_DEBUG_OBJECTS_PERCPU_COUNTER */ void percpu_counter_set(struct percpu_counter *fbc, s64 amount) { int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); for_each_possible_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); *pcount = 0; } fbc->count = amount; raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_set); /** * This function is both preempt and irq safe. The former is due to explicit * preemption disable. The latter is guaranteed by the fact that the slow path * is explicitly protected by an irq-safe spinlock whereas the fast patch uses * this_cpu_add which is irq-safe by definition. Hence there is no need muck * with irq state before calling this one */ void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch) { s64 count; preempt_disable(); count = __this_cpu_read(*fbc->counters) + amount; if (abs(count) >= batch) { unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); fbc->count += count; __this_cpu_sub(*fbc->counters, count - amount); raw_spin_unlock_irqrestore(&fbc->lock, flags); } else { this_cpu_add(*fbc->counters, amount); } preempt_enable(); } EXPORT_SYMBOL(percpu_counter_add_batch); /* * For percpu_counter with a big batch, the devication of its count could * be big, and there is requirement to reduce the deviation, like when the * counter's batch could be runtime decreased to get a better accuracy, * which can be achieved by running this sync function on each CPU. */ void percpu_counter_sync(struct percpu_counter *fbc) { unsigned long flags; s64 count; raw_spin_lock_irqsave(&fbc->lock, flags); count = __this_cpu_read(*fbc->counters); fbc->count += count; __this_cpu_sub(*fbc->counters, count); raw_spin_unlock_irqrestore(&fbc->lock, flags); } EXPORT_SYMBOL(percpu_counter_sync); /* * Add up all the per-cpu counts, return the result. This is a more accurate * but much slower version of percpu_counter_read_positive() */ s64 __percpu_counter_sum(struct percpu_counter *fbc) { s64 ret; int cpu; unsigned long flags; raw_spin_lock_irqsave(&fbc->lock, flags); ret = fbc->count; for_each_online_cpu(cpu) { s32 *pcount = per_cpu_ptr(fbc->counters, cpu); ret += *pcount; } raw_spin_unlock_irqrestore(&fbc->lock, flags); return ret; } EXPORT_SYMBOL(__percpu_counter_sum); int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp, struct lock_class_key *key) { unsigned long flags __maybe_unused; raw_spin_lock_init(&fbc->lock); lockdep_set_class(&fbc->lock, key); fbc->count = amount; fbc->counters = alloc_percpu_gfp(s32, gfp); if (!fbc->counters) return -ENOMEM; debug_percpu_counter_activate(fbc); #ifdef CONFIG_HOTPLUG_CPU INIT_LIST_HEAD(&fbc->list); spin_lock_irqsave(&percpu_counters_lock, flags); list_add(&fbc->list, &percpu_counters); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif return 0; } EXPORT_SYMBOL(__percpu_counter_init); void percpu_counter_destroy(struct percpu_counter *fbc) { unsigned long flags __maybe_unused; if (!fbc->counters) return; debug_percpu_counter_deactivate(fbc); #ifdef CONFIG_HOTPLUG_CPU spin_lock_irqsave(&percpu_counters_lock, flags); list_del(&fbc->list); spin_unlock_irqrestore(&percpu_counters_lock, flags); #endif free_percpu(fbc->counters); fbc->counters = NULL; } EXPORT_SYMBOL(percpu_counter_destroy); int percpu_counter_batch __read_mostly = 32; EXPORT_SYMBOL(percpu_counter_batch); static int compute_batch_value(unsigned int cpu) { int nr = num_online_cpus(); percpu_counter_batch = max(32, nr*2); return 0; } static int percpu_counter_cpu_dead(unsigned int cpu) { #ifdef CONFIG_HOTPLUG_CPU struct percpu_counter *fbc; compute_batch_value(cpu); spin_lock_irq(&percpu_counters_lock); list_for_each_entry(fbc, &percpu_counters, list) { s32 *pcount; raw_spin_lock(&fbc->lock); pcount = per_cpu_ptr(fbc->counters, cpu); fbc->count += *pcount; *pcount = 0; raw_spin_unlock(&fbc->lock); } spin_unlock_irq(&percpu_counters_lock); #endif return 0; } /* * Compare counter against given value. * Return 1 if greater, 0 if equal and -1 if less */ int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch) { s64 count; count = percpu_counter_read(fbc); /* Check to see if rough count will be sufficient for comparison */ if (abs(count - rhs) > (batch * num_online_cpus())) { if (count > rhs) return 1; else return -1; } /* Need to use precise count */ count = percpu_counter_sum(fbc); if (count > rhs) return 1; else if (count < rhs) return -1; else return 0; } EXPORT_SYMBOL(__percpu_counter_compare); static int __init percpu_counter_startup(void) { int ret; ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "lib/percpu_cnt:online", compute_batch_value, NULL); WARN_ON(ret < 0); ret = cpuhp_setup_state_nocalls(CPUHP_PERCPU_CNT_DEAD, "lib/percpu_cnt:dead", NULL, percpu_counter_cpu_dead); WARN_ON(ret < 0); return 0; } module_init(percpu_counter_startup);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 /* SPDX-License-Identifier: GPL-2.0 */ /* * Portions of this file * Copyright (C) 2018 Intel Corporation */ #ifndef __NET_WIRELESS_NL80211_H #define __NET_WIRELESS_NL80211_H #include "core.h" int nl80211_init(void); void nl80211_exit(void); void *nl80211hdr_put(struct sk_buff *skb, u32 portid, u32 seq, int flags, u8 cmd); bool nl80211_put_sta_rate(struct sk_buff *msg, struct rate_info *info, int attr); static inline u64 wdev_id(struct wireless_dev *wdev) { return (u64)wdev->identifier | ((u64)wiphy_to_rdev(wdev->wiphy)->wiphy_idx << 32); } int nl80211_prepare_wdev_dump(struct netlink_callback *cb, struct cfg80211_registered_device **rdev, struct wireless_dev **wdev); int nl80211_parse_chandef(struct cfg80211_registered_device *rdev, struct genl_info *info, struct cfg80211_chan_def *chandef); int nl80211_parse_random_mac(struct nlattr **attrs, u8 *mac_addr, u8 *mac_addr_mask); void nl80211_notify_wiphy(struct cfg80211_registered_device *rdev, enum nl80211_commands cmd); void nl80211_notify_iface(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_commands cmd); void nl80211_send_scan_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); struct sk_buff *nl80211_build_scan_msg(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, bool aborted); void nl80211_send_scan_msg(struct cfg80211_registered_device *rdev, struct sk_buff *msg); void nl80211_send_sched_scan(struct cfg80211_sched_scan_request *req, u32 cmd); void nl80211_common_reg_change_event(enum nl80211_commands cmd_id, struct regulatory_request *request); static inline void nl80211_send_reg_change_event(struct regulatory_request *request) { nl80211_common_reg_change_event(NL80211_CMD_REG_CHANGE, request); } static inline void nl80211_send_wiphy_reg_change_event(struct regulatory_request *request) { nl80211_common_reg_change_event(NL80211_CMD_WIPHY_REG_CHANGE, request); } void nl80211_send_rx_auth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_rx_assoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp, int uapsd_queues, const u8 *req_ies, size_t req_ies_len); void nl80211_send_deauth(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_disassoc(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *buf, size_t len, gfp_t gfp); void nl80211_send_auth_timeout(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, gfp_t gfp); void nl80211_send_assoc_timeout(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, gfp_t gfp); void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_connect_resp_params *params, gfp_t gfp); void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); void nl80211_michael_mic_failure(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *addr, enum nl80211_key_type key_type, int key_id, const u8 *tsc, gfp_t gfp); void nl80211_send_beacon_hint_event(struct wiphy *wiphy, struct ieee80211_channel *channel_before, struct ieee80211_channel *channel_after); void nl80211_send_ibss_bssid(struct cfg80211_registered_device *rdev, struct net_device *netdev, const u8 *bssid, gfp_t gfp); int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u32 nlpid, int freq, int sig_dbm, const u8 *buf, size_t len, u32 flags, gfp_t gfp); void nl80211_radar_notify(struct cfg80211_registered_device *rdev, const struct cfg80211_chan_def *chandef, enum nl80211_radar_event event, struct net_device *netdev, gfp_t gfp); void nl80211_send_ap_stopped(struct wireless_dev *wdev); void cfg80211_rdev_free_coalesce(struct cfg80211_registered_device *rdev); /* peer measurement */ int nl80211_pmsr_start(struct sk_buff *skb, struct genl_info *info); int nl80211_pmsr_dump_results(struct sk_buff *skb, struct netlink_callback *cb); #endif /* __NET_WIRELESS_NL80211_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 /* SPDX-License-Identifier: GPL-2.0 */ /* * The proc filesystem constants/structures */ #ifndef _LINUX_PROC_FS_H #define _LINUX_PROC_FS_H #include <linux/compiler.h> #include <linux/types.h> #include <linux/fs.h> struct proc_dir_entry; struct seq_file; struct seq_operations; enum { /* * All /proc entries using this ->proc_ops instance are never removed. * * If in doubt, ignore this flag. */ #ifdef MODULE PROC_ENTRY_PERMANENT = 0U, #else PROC_ENTRY_PERMANENT = 1U << 0, #endif }; struct proc_ops { unsigned int proc_flags; int (*proc_open)(struct inode *, struct file *); ssize_t (*proc_read)(struct file *, char __user *, size_t, loff_t *); ssize_t (*proc_read_iter)(struct kiocb *, struct iov_iter *); ssize_t (*proc_write)(struct file *, const char __user *, size_t, loff_t *); loff_t (*proc_lseek)(struct file *, loff_t, int); int (*proc_release)(struct inode *, struct file *); __poll_t (*proc_poll)(struct file *, struct poll_table_struct *); long (*proc_ioctl)(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT long (*proc_compat_ioctl)(struct file *, unsigned int, unsigned long); #endif int (*proc_mmap)(struct file *, struct vm_area_struct *); unsigned long (*proc_get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); } __randomize_layout; /* definitions for hide_pid field */ enum proc_hidepid { HIDEPID_OFF = 0, HIDEPID_NO_ACCESS = 1, HIDEPID_INVISIBLE = 2, HIDEPID_NOT_PTRACEABLE = 4, /* Limit pids to only ptraceable pids */ }; /* definitions for proc mount option pidonly */ enum proc_pidonly { PROC_PIDONLY_OFF = 0, PROC_PIDONLY_ON = 1, }; struct proc_fs_info { struct pid_namespace *pid_ns; struct dentry *proc_self; /* For /proc/self */ struct dentry *proc_thread_self; /* For /proc/thread-self */ kgid_t pid_gid; enum proc_hidepid hide_pid; enum proc_pidonly pidonly; }; static inline struct proc_fs_info *proc_sb_info(struct super_block *sb) { return sb->s_fs_info; } #ifdef CONFIG_PROC_FS typedef int (*proc_write_t)(struct file *, char *, size_t); extern void proc_root_init(void); extern void proc_flush_pid(struct pid *); extern struct proc_dir_entry *proc_symlink(const char *, struct proc_dir_entry *, const char *); struct proc_dir_entry *_proc_mkdir(const char *, umode_t, struct proc_dir_entry *, void *, bool); extern struct proc_dir_entry *proc_mkdir(const char *, struct proc_dir_entry *); extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t, struct proc_dir_entry *, void *); extern struct proc_dir_entry *proc_mkdir_mode(const char *, umode_t, struct proc_dir_entry *); struct proc_dir_entry *proc_create_mount_point(const char *name); struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data); #define proc_create_seq_data(name, mode, parent, ops, data) \ proc_create_seq_private(name, mode, parent, ops, 0, data) #define proc_create_seq(name, mode, parent, ops) \ proc_create_seq_private(name, mode, parent, ops, 0, NULL) struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data); #define proc_create_single(name, mode, parent, show) \ proc_create_single_data(name, mode, parent, show, NULL) extern struct proc_dir_entry *proc_create_data(const char *, umode_t, struct proc_dir_entry *, const struct proc_ops *, void *); struct proc_dir_entry *proc_create(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct proc_ops *proc_ops); extern void proc_set_size(struct proc_dir_entry *, loff_t); extern void proc_set_user(struct proc_dir_entry *, kuid_t, kgid_t); extern void *PDE_DATA(const struct inode *); extern void *proc_get_parent_data(const struct inode *); extern void proc_remove(struct proc_dir_entry *); extern void remove_proc_entry(const char *, struct proc_dir_entry *); extern int remove_proc_subtree(const char *, struct proc_dir_entry *); struct proc_dir_entry *proc_create_net_data(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, unsigned int state_size, void *data); #define proc_create_net(name, mode, parent, ops, state_size) \ proc_create_net_data(name, mode, parent, ops, state_size, NULL) struct proc_dir_entry *proc_create_net_single(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), void *data); struct proc_dir_entry *proc_create_net_data_write(const char *name, umode_t mode, struct proc_dir_entry *parent, const struct seq_operations *ops, proc_write_t write, unsigned int state_size, void *data); struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mode, struct proc_dir_entry *parent, int (*show)(struct seq_file *, void *), proc_write_t write, void *data); extern struct pid *tgid_pidfd_to_pid(const struct file *file); struct bpf_iter_aux_info; extern int bpf_iter_init_seq_net(void *priv_data, struct bpf_iter_aux_info *aux); extern void bpf_iter_fini_seq_net(void *priv_data); #ifdef CONFIG_PROC_PID_ARCH_STATUS /* * The architecture which selects CONFIG_PROC_PID_ARCH_STATUS must * provide proc_pid_arch_status() definition. */ int proc_pid_arch_status(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); #endif /* CONFIG_PROC_PID_ARCH_STATUS */ #else /* CONFIG_PROC_FS */ static inline void proc_root_init(void) { } static inline void proc_flush_pid(struct pid *pid) { } static inline struct proc_dir_entry *proc_symlink(const char *name, struct proc_dir_entry *parent,const char *dest) { return NULL;} static inline struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) {return NULL;} static inline struct proc_dir_entry *proc_create_mount_point(const char *name) { return NULL; } static inline struct proc_dir_entry *_proc_mkdir(const char *name, umode_t mode, struct proc_dir_entry *parent, void *data, bool force_lookup) { return NULL; } static inline struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode, struct proc_dir_entry *parent, void *data) { return NULL; } static inline struct proc_dir_entry *proc_mkdir_mode(const char *name, umode_t mode, struct proc_dir_entry *parent) { return NULL; } #define proc_create_seq_private(name, mode, parent, ops, size, data) ({NULL;}) #define proc_create_seq_data(name, mode, parent, ops, data) ({NULL;}) #define proc_create_seq(name, mode, parent, ops) ({NULL;}) #define proc_create_single(name, mode, parent, show) ({NULL;}) #define proc_create_single_data(name, mode, parent, show, data) ({NULL;}) #define proc_create(name, mode, parent, proc_ops) ({NULL;}) #define proc_create_data(name, mode, parent, proc_ops, data) ({NULL;}) static inline void proc_set_size(struct proc_dir_entry *de, loff_t size) {} static inline void proc_set_user(struct proc_dir_entry *de, kuid_t uid, kgid_t gid) {} static inline void *PDE_DATA(const struct inode *inode) {BUG(); return NULL;} static inline void *proc_get_parent_data(const struct inode *inode) { BUG(); return NULL; } static inline void proc_remove(struct proc_dir_entry *de) {} #define remove_proc_entry(name, parent) do {} while (0) static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *parent) { return 0; } #define proc_create_net_data(name, mode, parent, ops, state_size, data) ({NULL;}) #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) static inline struct pid *tgid_pidfd_to_pid(const struct file *file) { return ERR_PTR(-EBADF); } #endif /* CONFIG_PROC_FS */ struct net; static inline struct proc_dir_entry *proc_net_mkdir( struct net *net, const char *name, struct proc_dir_entry *parent) { return _proc_mkdir(name, 0, parent, net, true); } struct ns_common; int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)); /* get the associated pid namespace for a file in procfs */ static inline struct pid_namespace *proc_pid_ns(struct super_block *sb) { return proc_sb_info(sb)->pid_ns; } bool proc_ns_file(const struct file *file); #endif /* _LINUX_PROC_FS_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 /* SPDX-License-Identifier: GPL-2.0 */ /* Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ #ifndef _LINUX_KALLSYMS_H #define _LINUX_KALLSYMS_H #include <linux/errno.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> #include <asm/sections.h> #define KSYM_NAME_LEN 128 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) struct cred; struct module; static inline int is_kernel_inittext(unsigned long addr) { if (addr >= (unsigned long)_sinittext && addr <= (unsigned long)_einittext) return 1; return 0; } static inline int is_kernel_text(unsigned long addr) { if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; return in_gate_area_no_mm(addr); } static inline int is_ksym_addr(unsigned long addr) { if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); } static inline void *dereference_symbol_descriptor(void *ptr) { #ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); if (is_ksym_addr((unsigned long)ptr)) return ptr; preempt_disable(); mod = __module_address((unsigned long)ptr); preempt_enable(); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); #endif return ptr; } #ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); /* Call a function on each kallsyms symbol in the core kernel */ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); /* Lookup an address. modname is set to NULL if it's in the kernel. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf); /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* How and when do we show kallsyms values? */ extern bool kallsyms_show_value(const struct cred *cred); #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) { return 0; } static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) { return 0; } static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { return 0; } static inline const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return NULL; } static inline int sprint_symbol(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) { return -ERANGE; } static inline bool kallsyms_show_value(const struct cred *cred) { return false; } #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) { printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CPUHOTPLUG_H #define __CPUHOTPLUG_H #include <linux/types.h> /* * CPU-up CPU-down * * BP AP BP AP * * OFFLINE OFFLINE * | ^ * v | * BRINGUP_CPU->AP_OFFLINE BRINGUP_CPU <- AP_IDLE_DEAD (idle thread/play_dead) * | AP_OFFLINE * v (IRQ-off) ,---------------^ * AP_ONLNE | (stop_machine) * | TEARDOWN_CPU <- AP_ONLINE_IDLE * | ^ * v | * AP_ACTIVE AP_ACTIVE */ enum cpuhp_state { CPUHP_INVALID = -1, CPUHP_OFFLINE = 0, CPUHP_CREATE_THREADS, CPUHP_PERF_PREPARE, CPUHP_PERF_X86_PREPARE, CPUHP_PERF_X86_AMD_UNCORE_PREP, CPUHP_PERF_POWER, CPUHP_PERF_SUPERH, CPUHP_X86_HPET_DEAD, CPUHP_X86_APB_DEAD, CPUHP_X86_MCE_DEAD, CPUHP_VIRT_NET_DEAD, CPUHP_SLUB_DEAD, CPUHP_DEBUG_OBJ_DEAD, CPUHP_MM_WRITEBACK_DEAD, CPUHP_MM_VMSTAT_DEAD, CPUHP_SOFTIRQ_DEAD, CPUHP_NET_MVNETA_DEAD, CPUHP_CPUIDLE_DEAD, CPUHP_ARM64_FPSIMD_DEAD, CPUHP_ARM_OMAP_WAKE_DEAD, CPUHP_IRQ_POLL_DEAD, CPUHP_BLOCK_SOFTIRQ_DEAD, CPUHP_ACPI_CPUDRV_DEAD, CPUHP_S390_PFAULT_DEAD, CPUHP_BLK_MQ_DEAD, CPUHP_FS_BUFF_DEAD, CPUHP_PRINTK_DEAD, CPUHP_MM_MEMCQ_DEAD, CPUHP_PERCPU_CNT_DEAD, CPUHP_RADIX_DEAD, CPUHP_PAGE_ALLOC_DEAD, CPUHP_NET_DEV_DEAD, CPUHP_PCI_XGENE_DEAD, CPUHP_IOMMU_INTEL_DEAD, CPUHP_LUSTRE_CFS_DEAD, CPUHP_AP_ARM_CACHE_B15_RAC_DEAD, CPUHP_PADATA_DEAD, CPUHP_WORKQUEUE_PREP, CPUHP_POWER_NUMA_PREPARE, CPUHP_HRTIMERS_PREPARE, CPUHP_PROFILE_PREPARE, CPUHP_X2APIC_PREPARE, CPUHP_SMPCFD_PREPARE, CPUHP_RELAY_PREPARE, CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, CPUHP_XEN_PREPARE, CPUHP_XEN_EVTCHN_PREPARE, CPUHP_ARM_SHMOBILE_SCU_PREPARE, CPUHP_SH_SH3X_PREPARE, CPUHP_NET_FLOW_PREPARE, CPUHP_TOPOLOGY_PREPARE, CPUHP_NET_IUCV_PREPARE, CPUHP_ARM_BL_PREPARE, CPUHP_TRACE_RB_PREPARE, CPUHP_MM_ZS_PREPARE, CPUHP_MM_ZSWP_MEM_PREPARE, CPUHP_MM_ZSWP_POOL_PREPARE, CPUHP_KVM_PPC_BOOK3S_PREPARE, CPUHP_ZCOMP_PREPARE, CPUHP_TIMERS_PREPARE, CPUHP_MIPS_SOC_PREPARE, CPUHP_BP_PREPARE_DYN, CPUHP_BP_PREPARE_DYN_END = CPUHP_BP_PREPARE_DYN + 20, CPUHP_BRINGUP_CPU, CPUHP_AP_IDLE_DEAD, CPUHP_AP_OFFLINE, CPUHP_AP_SCHED_STARTING, CPUHP_AP_RCUTREE_DYING, CPUHP_AP_CPU_PM_STARTING, CPUHP_AP_IRQ_GIC_STARTING, CPUHP_AP_IRQ_HIP04_STARTING, CPUHP_AP_IRQ_ARMADA_XP_STARTING, CPUHP_AP_IRQ_BCM2836_STARTING, CPUHP_AP_IRQ_MIPS_GIC_STARTING, CPUHP_AP_IRQ_RISCV_STARTING, CPUHP_AP_IRQ_SIFIVE_PLIC_STARTING, CPUHP_AP_ARM_MVEBU_COHERENCY, CPUHP_AP_MICROCODE_LOADER, CPUHP_AP_PERF_X86_AMD_UNCORE_STARTING, CPUHP_AP_PERF_X86_STARTING, CPUHP_AP_PERF_X86_AMD_IBS_STARTING, CPUHP_AP_PERF_X86_CQM_STARTING, CPUHP_AP_PERF_X86_CSTATE_STARTING, CPUHP_AP_PERF_XTENSA_STARTING, CPUHP_AP_MIPS_OP_LOONGSON3_STARTING, CPUHP_AP_ARM_SDEI_STARTING, CPUHP_AP_ARM_VFP_STARTING, CPUHP_AP_ARM64_DEBUG_MONITORS_STARTING, CPUHP_AP_PERF_ARM_HW_BREAKPOINT_STARTING, CPUHP_AP_PERF_ARM_ACPI_STARTING, CPUHP_AP_PERF_ARM_STARTING, CPUHP_AP_ARM_L2X0_STARTING, CPUHP_AP_EXYNOS4_MCT_TIMER_STARTING, CPUHP_AP_ARM_ARCH_TIMER_STARTING, CPUHP_AP_ARM_GLOBAL_TIMER_STARTING, CPUHP_AP_JCORE_TIMER_STARTING, CPUHP_AP_ARM_TWD_STARTING, CPUHP_AP_QCOM_TIMER_STARTING, CPUHP_AP_TEGRA_TIMER_STARTING, CPUHP_AP_ARMADA_TIMER_STARTING, CPUHP_AP_MARCO_TIMER_STARTING, CPUHP_AP_MIPS_GIC_TIMER_STARTING, CPUHP_AP_ARC_TIMER_STARTING, CPUHP_AP_RISCV_TIMER_STARTING, CPUHP_AP_CLINT_TIMER_STARTING, CPUHP_AP_CSKY_TIMER_STARTING, CPUHP_AP_TI_GP_TIMER_STARTING, CPUHP_AP_HYPERV_TIMER_STARTING, CPUHP_AP_KVM_STARTING, CPUHP_AP_KVM_ARM_VGIC_INIT_STARTING, CPUHP_AP_KVM_ARM_VGIC_STARTING, CPUHP_AP_KVM_ARM_TIMER_STARTING, /* Must be the last timer callback */ CPUHP_AP_DUMMY_TIMER_STARTING, CPUHP_AP_ARM_XEN_STARTING, CPUHP_AP_ARM_CORESIGHT_STARTING, CPUHP_AP_ARM_CORESIGHT_CTI_STARTING, CPUHP_AP_ARM64_ISNDEP_STARTING, CPUHP_AP_SMPCFD_DYING, CPUHP_AP_X86_TBOOT_DYING, CPUHP_AP_ARM_CACHE_B15_RAC_DYING, CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_AP_ONLINE_IDLE, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, CPUHP_AP_BLK_MQ_ONLINE, CPUHP_AP_ARM_MVEBU_SYNC_CLOCKS, CPUHP_AP_X86_INTEL_EPB_ONLINE, CPUHP_AP_PERF_ONLINE, CPUHP_AP_PERF_X86_ONLINE, CPUHP_AP_PERF_X86_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, CPUHP_AP_PERF_X86_RAPL_ONLINE, CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, CPUHP_AP_PERF_ARM_HISI_DDRC_ONLINE, CPUHP_AP_PERF_ARM_HISI_HHA_ONLINE, CPUHP_AP_PERF_ARM_HISI_L3_ONLINE, CPUHP_AP_PERF_ARM_L2X0_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE, CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE, CPUHP_AP_PERF_ARM_APM_XGENE_ONLINE, CPUHP_AP_PERF_ARM_CAVIUM_TX2_UNCORE_ONLINE, CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_TRACE_IMC_ONLINE, CPUHP_AP_PERF_POWERPC_HV_24x7_ONLINE, CPUHP_AP_PERF_POWERPC_HV_GPCI_ONLINE, CPUHP_AP_WATCHDOG_ONLINE, CPUHP_AP_WORKQUEUE_ONLINE, CPUHP_AP_RCUTREE_ONLINE, CPUHP_AP_BASE_CACHEINFO_ONLINE, CPUHP_AP_ONLINE_DYN, CPUHP_AP_ONLINE_DYN_END = CPUHP_AP_ONLINE_DYN + 30, CPUHP_AP_X86_HPET_ONLINE, CPUHP_AP_X86_KVM_CLK_ONLINE, CPUHP_AP_ACTIVE, CPUHP_ONLINE, }; int __cpuhp_setup_state(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); int __cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, bool invoke, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu), bool multi_instance); /** * cpuhp_setup_state - Setup hotplug state callbacks with calling the callbacks * @state: The state for which the calls are installed * @name: Name of the callback (will be used in debug output) * @startup: startup callback function * @teardown: teardown callback function * * Installs the callback functions and invokes the startup callback on * the present cpus which have already reached the @state. */ static inline int cpuhp_setup_state(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state(state, name, true, startup, teardown, false); } static inline int cpuhp_setup_state_cpuslocked(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state_cpuslocked(state, name, true, startup, teardown, false); } /** * cpuhp_setup_state_nocalls - Setup hotplug state callbacks without calling the * callbacks * @state: The state for which the calls are installed * @name: Name of the callback. * @startup: startup callback function * @teardown: teardown callback function * * Same as @cpuhp_setup_state except that no calls are executed are invoked * during installation of this callback. NOP if SMP=n or HOTPLUG_CPU=n. */ static inline int cpuhp_setup_state_nocalls(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state(state, name, false, startup, teardown, false); } static inline int cpuhp_setup_state_nocalls_cpuslocked(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu), int (*teardown)(unsigned int cpu)) { return __cpuhp_setup_state_cpuslocked(state, name, false, startup, teardown, false); } /** * cpuhp_setup_state_multi - Add callbacks for multi state * @state: The state for which the calls are installed * @name: Name of the callback. * @startup: startup callback function * @teardown: teardown callback function * * Sets the internal multi_instance flag and prepares a state to work as a multi * instance callback. No callbacks are invoked at this point. The callbacks are * invoked once an instance for this state are registered via * @cpuhp_state_add_instance or @cpuhp_state_add_instance_nocalls. */ static inline int cpuhp_setup_state_multi(enum cpuhp_state state, const char *name, int (*startup)(unsigned int cpu, struct hlist_node *node), int (*teardown)(unsigned int cpu, struct hlist_node *node)) { return __cpuhp_setup_state(state, name, false, (void *) startup, (void *) teardown, true); } int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); int __cpuhp_state_add_instance_cpuslocked(enum cpuhp_state state, struct hlist_node *node, bool invoke); /** * cpuhp_state_add_instance - Add an instance for a state and invoke startup * callback. * @state: The state for which the instance is installed * @node: The node for this individual state. * * Installs the instance for the @state and invokes the startup callback on * the present cpus which have already reached the @state. The @state must have * been earlier marked as multi-instance by @cpuhp_setup_state_multi. */ static inline int cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_add_instance(state, node, true); } /** * cpuhp_state_add_instance_nocalls - Add an instance for a state without * invoking the startup callback. * @state: The state for which the instance is installed * @node: The node for this individual state. * * Installs the instance for the @state The @state must have been earlier * marked as multi-instance by @cpuhp_setup_state_multi. */ static inline int cpuhp_state_add_instance_nocalls(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_add_instance(state, node, false); } static inline int cpuhp_state_add_instance_nocalls_cpuslocked(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_add_instance_cpuslocked(state, node, false); } void __cpuhp_remove_state(enum cpuhp_state state, bool invoke); void __cpuhp_remove_state_cpuslocked(enum cpuhp_state state, bool invoke); /** * cpuhp_remove_state - Remove hotplug state callbacks and invoke the teardown * @state: The state for which the calls are removed * * Removes the callback functions and invokes the teardown callback on * the present cpus which have already reached the @state. */ static inline void cpuhp_remove_state(enum cpuhp_state state) { __cpuhp_remove_state(state, true); } /** * cpuhp_remove_state_nocalls - Remove hotplug state callbacks without invoking * teardown * @state: The state for which the calls are removed */ static inline void cpuhp_remove_state_nocalls(enum cpuhp_state state) { __cpuhp_remove_state(state, false); } static inline void cpuhp_remove_state_nocalls_cpuslocked(enum cpuhp_state state) { __cpuhp_remove_state_cpuslocked(state, false); } /** * cpuhp_remove_multi_state - Remove hotplug multi state callback * @state: The state for which the calls are removed * * Removes the callback functions from a multi state. This is the reverse of * cpuhp_setup_state_multi(). All instances should have been removed before * invoking this function. */ static inline void cpuhp_remove_multi_state(enum cpuhp_state state) { __cpuhp_remove_state(state, false); } int __cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node, bool invoke); /** * cpuhp_state_remove_instance - Remove hotplug instance from state and invoke * the teardown callback * @state: The state from which the instance is removed * @node: The node for this individual state. * * Removes the instance and invokes the teardown callback on the present cpus * which have already reached the @state. */ static inline int cpuhp_state_remove_instance(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_remove_instance(state, node, true); } /** * cpuhp_state_remove_instance_nocalls - Remove hotplug instance from state * without invoking the reatdown callback * @state: The state from which the instance is removed * @node: The node for this individual state. * * Removes the instance without invoking the teardown callback. */ static inline int cpuhp_state_remove_instance_nocalls(enum cpuhp_state state, struct hlist_node *node) { return __cpuhp_state_remove_instance(state, node, false); } #ifdef CONFIG_SMP void cpuhp_online_idle(enum cpuhp_state state); #else static inline void cpuhp_online_idle(enum cpuhp_state state) { } #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_ICMPV6_H #define _LINUX_ICMPV6_H #include <linux/skbuff.h> #include <linux/ipv6.h> #include <uapi/linux/icmpv6.h> static inline struct icmp6hdr *icmp6_hdr(const struct sk_buff *skb) { return (struct icmp6hdr *)skb_transport_header(skb); } #include <linux/netdevice.h> #if IS_ENABLED(CONFIG_IPV6) typedef void ip6_icmp_send_t(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct in6_addr *force_saddr, const struct inet6_skb_parm *parm); #if IS_BUILTIN(CONFIG_IPV6) static inline void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm) { icmp6_send(skb, type, code, info, NULL, parm); } static inline int inet6_register_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } static inline int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn) { BUILD_BUG_ON(fn != icmp6_send); return 0; } #else extern void __icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, const struct inet6_skb_parm *parm); extern int inet6_register_icmp_sender(ip6_icmp_send_t *fn); extern int inet6_unregister_icmp_sender(ip6_icmp_send_t *fn); #endif static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { __icmpv6_send(skb, type, code, info, IP6CB(skb)); } int ip6_err_gen_icmpv6_unreach(struct sk_buff *skb, int nhs, int type, unsigned int data_len); #if IS_ENABLED(CONFIG_NF_NAT) void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info); #else static inline void icmpv6_ndo_send(struct sk_buff *skb_in, u8 type, u8 code, __u32 info) { struct inet6_skb_parm parm = { 0 }; __icmpv6_send(skb_in, type, code, info, &parm); } #endif #else static inline void icmpv6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } static inline void icmpv6_ndo_send(struct sk_buff *skb, u8 type, u8 code, __u32 info) { } #endif extern int icmpv6_init(void); extern int icmpv6_err_convert(u8 type, u8 code, int *err); extern void icmpv6_cleanup(void); extern void icmpv6_param_prob(struct sk_buff *skb, u8 code, int pos); struct flowi6; struct in6_addr; extern void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6, u8 type, const struct in6_addr *saddr, const struct in6_addr *daddr, int oif); static inline bool icmpv6_is_err(int type) { switch (type) { case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: case ICMPV6_PARAMPROB: return true; } return false; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_ECN_H_ #define _INET_ECN_H_ #include <linux/ip.h> #include <linux/skbuff.h> #include <linux/if_vlan.h> #include <net/inet_sock.h> #include <net/dsfield.h> enum { INET_ECN_NOT_ECT = 0, INET_ECN_ECT_1 = 1, INET_ECN_ECT_0 = 2, INET_ECN_CE = 3, INET_ECN_MASK = 3, }; extern int sysctl_tunnel_ecn_log; static inline int INET_ECN_is_ce(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_CE; } static inline int INET_ECN_is_not_ect(__u8 dsfield) { return (dsfield & INET_ECN_MASK) == INET_ECN_NOT_ECT; } static inline int INET_ECN_is_capable(__u8 dsfield) { return dsfield & INET_ECN_ECT_0; } /* * RFC 3168 9.1.1 * The full-functionality option for ECN encapsulation is to copy the * ECN codepoint of the inside header to the outside header on * encapsulation if the inside header is not-ECT or ECT, and to set the * ECN codepoint of the outside header to ECT(0) if the ECN codepoint of * the inside header is CE. */ static inline __u8 INET_ECN_encapsulate(__u8 outer, __u8 inner) { outer &= ~INET_ECN_MASK; outer |= !INET_ECN_is_ce(inner) ? (inner & INET_ECN_MASK) : INET_ECN_ECT_0; return outer; } static inline void INET_ECN_xmit(struct sock *sk) { inet_sk(sk)->tos |= INET_ECN_ECT_0; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass |= INET_ECN_ECT_0; } static inline void INET_ECN_dontxmit(struct sock *sk) { inet_sk(sk)->tos &= ~INET_ECN_MASK; if (inet6_sk(sk) != NULL) inet6_sk(sk)->tclass &= ~INET_ECN_MASK; } #define IP6_ECN_flow_init(label) do { \ (label) &= ~htonl(INET_ECN_MASK << 20); \ } while (0) #define IP6_ECN_flow_xmit(sk, label) do { \ if (INET_ECN_is_capable(inet6_sk(sk)->tclass)) \ (label) |= htonl(INET_ECN_ECT_0 << 20); \ } while (0) static inline int IP_ECN_set_ce(struct iphdr *iph) { u32 check = (__force u32)iph->check; u32 ecn = (iph->tos + 1) & INET_ECN_MASK; /* * After the last operation we have (in binary): * INET_ECN_NOT_ECT => 01 * INET_ECN_ECT_1 => 10 * INET_ECN_ECT_0 => 11 * INET_ECN_CE => 00 */ if (!(ecn & 2)) return !ecn; /* * The following gives us: * INET_ECN_ECT_1 => check += htons(0xFFFD) * INET_ECN_ECT_0 => check += htons(0xFFFE) */ check += (__force u16)htons(0xFFFB) + (__force u16)htons(ecn); iph->check = (__force __sum16)(check + (check>=0xFFFF)); iph->tos |= INET_ECN_CE; return 1; } static inline int IP_ECN_set_ect1(struct iphdr *iph) { u32 check = (__force u32)iph->check; if ((iph->tos & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; check += (__force u16)htons(0x1); iph->check = (__force __sum16)(check + (check>=0xFFFF)); iph->tos ^= INET_ECN_MASK; return 1; } static inline void IP_ECN_clear(struct iphdr *iph) { iph->tos &= ~INET_ECN_MASK; } static inline void ipv4_copy_dscp(unsigned int dscp, struct iphdr *inner) { dscp &= ~INET_ECN_MASK; ipv4_change_dsfield(inner, INET_ECN_MASK, dscp); } struct ipv6hdr; /* Note: * IP_ECN_set_ce() has to tweak IPV4 checksum when setting CE, * meaning both changes have no effect on skb->csum if/when CHECKSUM_COMPLETE * In IPv6 case, no checksum compensates the change in IPv6 header, * so we have to update skb->csum. */ static inline int IP6_ECN_set_ce(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph))) return 0; from = *(__be32 *)iph; to = from | htonl(INET_ECN_CE << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline int IP6_ECN_set_ect1(struct sk_buff *skb, struct ipv6hdr *iph) { __be32 from, to; if ((ipv6_get_dsfield(iph) & INET_ECN_MASK) != INET_ECN_ECT_0) return 0; from = *(__be32 *)iph; to = from ^ htonl(INET_ECN_MASK << 20); *(__be32 *)iph = to; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from), (__force __wsum)to); return 1; } static inline void ipv6_copy_dscp(unsigned int dscp, struct ipv6hdr *inner) { dscp &= ~INET_ECN_MASK; ipv6_change_dsfield(inner, INET_ECN_MASK, dscp); } static inline int INET_ECN_set_ce(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ce(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ce(skb, ipv6_hdr(skb)); break; } return 0; } static inline int INET_ECN_set_ect1(struct sk_buff *skb) { switch (skb_protocol(skb, true)) { case cpu_to_be16(ETH_P_IP): if (skb_network_header(skb) + sizeof(struct iphdr) <= skb_tail_pointer(skb)) return IP_ECN_set_ect1(ip_hdr(skb)); break; case cpu_to_be16(ETH_P_IPV6): if (skb_network_header(skb) + sizeof(struct ipv6hdr) <= skb_tail_pointer(skb)) return IP6_ECN_set_ect1(skb, ipv6_hdr(skb)); break; } return 0; } /* * RFC 6040 4.2 * To decapsulate the inner header at the tunnel egress, a compliant * tunnel egress MUST set the outgoing ECN field to the codepoint at the * intersection of the appropriate arriving inner header (row) and outer * header (column) in Figure 4 * * +---------+------------------------------------------------+ * |Arriving | Arriving Outer Header | * | Inner +---------+------------+------------+------------+ * | Header | Not-ECT | ECT(0) | ECT(1) | CE | * +---------+---------+------------+------------+------------+ * | Not-ECT | Not-ECT |Not-ECT(!!!)|Not-ECT(!!!)| <drop>(!!!)| * | ECT(0) | ECT(0) | ECT(0) | ECT(1) | CE | * | ECT(1) | ECT(1) | ECT(1) (!) | ECT(1) | CE | * | CE | CE | CE | CE(!!!)| CE | * +---------+---------+------------+------------+------------+ * * Figure 4: New IP in IP Decapsulation Behaviour * * returns 0 on success * 1 if something is broken and should be logged (!!! above) * 2 if packet should be dropped */ static inline int __INET_ECN_decapsulate(__u8 outer, __u8 inner, bool *set_ce) { if (INET_ECN_is_not_ect(inner)) { switch (outer & INET_ECN_MASK) { case INET_ECN_NOT_ECT: return 0; case INET_ECN_ECT_0: case INET_ECN_ECT_1: return 1; case INET_ECN_CE: return 2; } } *set_ce = INET_ECN_is_ce(outer); return 0; } static inline int INET_ECN_decapsulate(struct sk_buff *skb, __u8 outer, __u8 inner) { bool set_ce = false; int rc; rc = __INET_ECN_decapsulate(outer, inner, &set_ce); if (!rc) { if (set_ce) INET_ECN_set_ce(skb); else if ((outer & INET_ECN_MASK) == INET_ECN_ECT_1) INET_ECN_set_ect1(skb); } return rc; } static inline int IP_ECN_decapsulate(const struct iphdr *oiph, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, oiph->tos, inner); } static inline int IP6_ECN_decapsulate(const struct ipv6hdr *oipv6h, struct sk_buff *skb) { __u8 inner; switch (skb_protocol(skb, true)) { case htons(ETH_P_IP): inner = ip_hdr(skb)->tos; break; case htons(ETH_P_IPV6): inner = ipv6_get_dsfield(ipv6_hdr(skb)); break; default: return 0; } return INET_ECN_decapsulate(skb, ipv6_get_dsfield(oipv6h), inner); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_MM_H #define _LINUX_SCHED_MM_H #include <linux/kernel.h> #include <linux/atomic.h> #include <linux/sched.h> #include <linux/mm_types.h> #include <linux/gfp.h> #include <linux/sync_core.h> /* * Routines for handling mm_structs */ extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. * @mm: The &struct mm_struct to pin. * * Make sure that @mm will not get freed even after the owning task * exits. This doesn't guarantee that the associated address space * will still exist later on and mmget_not_zero() has to be used before * accessing it. * * This is a preferred way to pin @mm for a longer/unbounded amount * of time. * * Use mmdrop() to release the reference acquired by mmgrab(). * * See also <Documentation/vm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmgrab(struct mm_struct *mm) { atomic_inc(&mm->mm_count); } extern void __mmdrop(struct mm_struct *mm); static inline void mmdrop(struct mm_struct *mm) { /* * The implicit full barrier implied by atomic_dec_and_test() is * required by the membarrier system call before returning to * user-space, after storing to rq->curr. */ if (unlikely(atomic_dec_and_test(&mm->mm_count))) __mmdrop(mm); } /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. * * Make sure that the address space of the given &struct mm_struct doesn't * go away. This does not protect against parts of the address space being * modified or freed, however. * * Never use this function to pin this address space for an * unbounded/indefinite amount of time. * * Use mmput() to release the reference acquired by mmget(). * * See also <Documentation/vm/active_mm.rst> for an in-depth explanation * of &mm_struct.mm_count vs &mm_struct.mm_users. */ static inline void mmget(struct mm_struct *mm) { atomic_inc(&mm->mm_users); } static inline bool mmget_not_zero(struct mm_struct *mm) { return atomic_inc_not_zero(&mm->mm_users); } /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); #ifdef CONFIG_MMU /* same as above but performs the slow path from the async context. Can * be called from the atomic context as well */ void mmput_async(struct mm_struct *); #endif /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* * Grab a reference to a task's mm, if it is not already going away * and ptrace_may_access with the mode parameter passed to it * succeeds. */ extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode); /* Remove the current tasks stale references to the old mm_struct on exit() */ extern void exit_mm_release(struct task_struct *, struct mm_struct *); /* Remove the current tasks stale references to the old mm_struct on exec() */ extern void exec_mm_release(struct task_struct *, struct mm_struct *); #ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); #else static inline void mm_update_next_owner(struct mm_struct *mm) { } #endif /* CONFIG_MEMCG */ #ifdef CONFIG_MMU extern void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack); extern unsigned long arch_get_unmapped_area(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); extern unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); #else static inline void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack) {} #endif static inline bool in_vfork(struct task_struct *tsk) { bool ret; /* * need RCU to access ->real_parent if CLONE_VM was used along with * CLONE_PARENT. * * We check real_parent->mm == tsk->mm because CLONE_VFORK does not * imply CLONE_VM * * CLONE_VFORK can be used with CLONE_PARENT/CLONE_THREAD and thus * ->real_parent is not necessarily the task doing vfork(), so in * theory we can't rely on task_lock() if we want to dereference it. * * And in this case we can't trust the real_parent->mm == tsk->mm * check, it can be false negative. But we do not care, if init or * another oom-unkillable task does this it should blame itself. */ rcu_read_lock(); ret = tsk->vfork_done && rcu_dereference(tsk->real_parent)->mm == tsk->mm; rcu_read_unlock(); return ret; } /* * Applies per-task gfp context to the given allocation flags. * PF_MEMALLOC_NOIO implies GFP_NOIO * PF_MEMALLOC_NOFS implies GFP_NOFS */ static inline gfp_t current_gfp_context(gfp_t flags) { unsigned int pflags = READ_ONCE(current->flags); if (unlikely(pflags & (PF_MEMALLOC_NOIO | PF_MEMALLOC_NOFS))) { /* * NOIO implies both NOIO and NOFS and it is a weaker context * so always make sure it makes precedence */ if (pflags & PF_MEMALLOC_NOIO) flags &= ~(__GFP_IO | __GFP_FS); else if (pflags & PF_MEMALLOC_NOFS) flags &= ~__GFP_FS; } return flags; } #ifdef CONFIG_LOCKDEP extern void __fs_reclaim_acquire(void); extern void __fs_reclaim_release(void); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else static inline void __fs_reclaim_acquire(void) { } static inline void __fs_reclaim_release(void) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif /** * memalloc_noio_save - Marks implicit GFP_NOIO allocation scope. * * This functions marks the beginning of the GFP_NOIO allocation scope. * All further allocations will implicitly drop __GFP_IO flag and so * they are safe for the IO critical section from the allocation recursion * point of view. Use memalloc_noio_restore to end the scope with flags * returned by this function. * * This function is safe to be used from any context. */ static inline unsigned int memalloc_noio_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOIO; current->flags |= PF_MEMALLOC_NOIO; return flags; } /** * memalloc_noio_restore - Ends the implicit GFP_NOIO scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOIO scope started by memalloc_noio_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_noio_save call. */ static inline void memalloc_noio_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags; } /** * memalloc_nofs_save - Marks implicit GFP_NOFS allocation scope. * * This functions marks the beginning of the GFP_NOFS allocation scope. * All further allocations will implicitly drop __GFP_FS flag and so * they are safe for the FS critical section from the allocation recursion * point of view. Use memalloc_nofs_restore to end the scope with flags * returned by this function. * * This function is safe to be used from any context. */ static inline unsigned int memalloc_nofs_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOFS; current->flags |= PF_MEMALLOC_NOFS; return flags; } /** * memalloc_nofs_restore - Ends the implicit GFP_NOFS scope. * @flags: Flags to restore. * * Ends the implicit GFP_NOFS scope started by memalloc_nofs_save function. * Always make sure that the given flags is the return value from the * pairing memalloc_nofs_save call. */ static inline void memalloc_nofs_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_NOFS) | flags; } static inline unsigned int memalloc_noreclaim_save(void) { unsigned int flags = current->flags & PF_MEMALLOC; current->flags |= PF_MEMALLOC; return flags; } static inline void memalloc_noreclaim_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC) | flags; } #ifdef CONFIG_CMA static inline unsigned int memalloc_nocma_save(void) { unsigned int flags = current->flags & PF_MEMALLOC_NOCMA; current->flags |= PF_MEMALLOC_NOCMA; return flags; } static inline void memalloc_nocma_restore(unsigned int flags) { current->flags = (current->flags & ~PF_MEMALLOC_NOCMA) | flags; } #else static inline unsigned int memalloc_nocma_save(void) { return 0; } static inline void memalloc_nocma_restore(unsigned int flags) { } #endif #ifdef CONFIG_MEMCG DECLARE_PER_CPU(struct mem_cgroup *, int_active_memcg); /** * set_active_memcg - Starts the remote memcg charging scope. * @memcg: memcg to charge. * * This function marks the beginning of the remote memcg charging scope. All the * __GFP_ACCOUNT allocations till the end of the scope will be charged to the * given memcg. * * NOTE: This function can nest. Users must save the return value and * reset the previous value after their own charging scope is over. */ static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { struct mem_cgroup *old; if (in_interrupt()) { old = this_cpu_read(int_active_memcg); this_cpu_write(int_active_memcg, memcg); } else { old = current->active_memcg; current->active_memcg = memcg; } return old; } #else static inline struct mem_cgroup * set_active_memcg(struct mem_cgroup *memcg) { return NULL; } #endif #ifdef CONFIG_MEMBARRIER enum { MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY = (1U << 0), MEMBARRIER_STATE_PRIVATE_EXPEDITED = (1U << 1), MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY = (1U << 2), MEMBARRIER_STATE_GLOBAL_EXPEDITED = (1U << 3), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY = (1U << 4), MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE = (1U << 5), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ_READY = (1U << 6), MEMBARRIER_STATE_PRIVATE_EXPEDITED_RSEQ = (1U << 7), }; enum { MEMBARRIER_FLAG_SYNC_CORE = (1U << 0), MEMBARRIER_FLAG_RSEQ = (1U << 1), }; #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS #include <asm/membarrier.h> #endif static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { if (current->mm != mm) return; if (likely(!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) return; sync_core_before_usermode(); } extern void membarrier_exec_mmap(struct mm_struct *mm); #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { } #endif static inline void membarrier_exec_mmap(struct mm_struct *mm) { } static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } #endif #endif /* _LINUX_SCHED_MM_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UIDGID_H #define _LINUX_UIDGID_H /* * A set of types for the internal kernel types representing uids and gids. * * The types defined in this header allow distinguishing which uids and gids in * the kernel are values used by userspace and which uid and gid values are * the internal kernel values. With the addition of user namespaces the values * can be different. Using the type system makes it possible for the compiler * to detect when we overlook these differences. * */ #include <linux/types.h> #include <linux/highuid.h> struct user_namespace; extern struct user_namespace init_user_ns; typedef struct { uid_t val; } kuid_t; typedef struct { gid_t val; } kgid_t; #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } #ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; } static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } #else static inline uid_t __kuid_val(kuid_t uid) { return 0; } static inline gid_t __kgid_val(kgid_t gid) { return 0; } #endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) #define INVALID_UID KUIDT_INIT(-1) #define INVALID_GID KGIDT_INIT(-1) static inline bool uid_eq(kuid_t left, kuid_t right) { return __kuid_val(left) == __kuid_val(right); } static inline bool gid_eq(kgid_t left, kgid_t right) { return __kgid_val(left) == __kgid_val(right); } static inline bool uid_gt(kuid_t left, kuid_t right) { return __kuid_val(left) > __kuid_val(right); } static inline bool gid_gt(kgid_t left, kgid_t right) { return __kgid_val(left) > __kgid_val(right); } static inline bool uid_gte(kuid_t left, kuid_t right) { return __kuid_val(left) >= __kuid_val(right); } static inline bool gid_gte(kgid_t left, kgid_t right) { return __kgid_val(left) >= __kgid_val(right); } static inline bool uid_lt(kuid_t left, kuid_t right) { return __kuid_val(left) < __kuid_val(right); } static inline bool gid_lt(kgid_t left, kgid_t right) { return __kgid_val(left) < __kgid_val(right); } static inline bool uid_lte(kuid_t left, kuid_t right) { return __kuid_val(left) <= __kuid_val(right); } static inline bool gid_lte(kgid_t left, kgid_t right) { return __kgid_val(left) <= __kgid_val(right); } static inline bool uid_valid(kuid_t uid) { return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS extern kuid_t make_kuid(struct user_namespace *from, uid_t uid); extern kgid_t make_kgid(struct user_namespace *from, gid_t gid); extern uid_t from_kuid(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid(struct user_namespace *to, kgid_t gid); extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid); static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return from_kuid(ns, uid) != (uid_t) -1; } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return from_kgid(ns, gid) != (gid_t) -1; } #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) { return KUIDT_INIT(uid); } static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) { return KGIDT_INIT(gid); } static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid) { return __kuid_val(kuid); } static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid) { return __kgid_val(kgid); } static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) { uid_t uid = from_kuid(to, kuid); if (uid == (uid_t)-1) uid = overflowuid; return uid; } static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) { gid_t gid = from_kgid(to, kgid); if (gid == (gid_t)-1) gid = overflowgid; return gid; } static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return gid_valid(gid); } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 /* SPDX-License-Identifier: GPL-2.0-only */ /* * NSA Security-Enhanced Linux (SELinux) security module * * This file contains the SELinux security data structures for kernel objects. * * Author(s): Stephen Smalley, <sds@tycho.nsa.gov> * Chris Vance, <cvance@nai.com> * Wayne Salamon, <wsalamon@nai.com> * James Morris <jmorris@redhat.com> * * Copyright (C) 2001,2002 Networks Associates Technology, Inc. * Copyright (C) 2003 Red Hat, Inc., James Morris <jmorris@redhat.com> * Copyright (C) 2016 Mellanox Technologies */ #ifndef _SELINUX_OBJSEC_H_ #define _SELINUX_OBJSEC_H_ #include <linux/list.h> #include <linux/sched.h> #include <linux/fs.h> #include <linux/binfmts.h> #include <linux/in.h> #include <linux/spinlock.h> #include <linux/lsm_hooks.h> #include <linux/msg.h> #include <net/net_namespace.h> #include "flask.h" #include "avc.h" struct task_security_struct { u32 osid; /* SID prior to last execve */ u32 sid; /* current SID */ u32 exec_sid; /* exec SID */ u32 create_sid; /* fscreate SID */ u32 keycreate_sid; /* keycreate SID */ u32 sockcreate_sid; /* fscreate SID */ } __randomize_layout; enum label_initialized { LABEL_INVALID, /* invalid or not initialized */ LABEL_INITIALIZED, /* initialized */ LABEL_PENDING }; struct inode_security_struct { struct inode *inode; /* back pointer to inode object */ struct list_head list; /* list of inode_security_struct */ u32 task_sid; /* SID of creating task */ u32 sid; /* SID of this object */ u16 sclass; /* security class of this object */ unsigned char initialized; /* initialization flag */ spinlock_t lock; }; struct file_security_struct { u32 sid; /* SID of open file description */ u32 fown_sid; /* SID of file owner (for SIGIO) */ u32 isid; /* SID of inode at the time of file open */ u32 pseqno; /* Policy seqno at the time of file open */ }; struct superblock_security_struct { struct super_block *sb; /* back pointer to sb object */ u32 sid; /* SID of file system superblock */ u32 def_sid; /* default SID for labeling */ u32 mntpoint_sid; /* SECURITY_FS_USE_MNTPOINT context for files */ unsigned short behavior; /* labeling behavior */ unsigned short flags; /* which mount options were specified */ struct mutex lock; struct list_head isec_head; spinlock_t isec_lock; }; struct msg_security_struct { u32 sid; /* SID of message */ }; struct ipc_security_struct { u16 sclass; /* security class of this object */ u32 sid; /* SID of IPC resource */ }; struct netif_security_struct { struct net *ns; /* network namespace */ int ifindex; /* device index */ u32 sid; /* SID for this interface */ }; struct netnode_security_struct { union { __be32 ipv4; /* IPv4 node address */ struct in6_addr ipv6; /* IPv6 node address */ } addr; u32 sid; /* SID for this node */ u16 family; /* address family */ }; struct netport_security_struct { u32 sid; /* SID for this node */ u16 port; /* port number */ u8 protocol; /* transport protocol */ }; struct sk_security_struct { #ifdef CONFIG_NETLABEL enum { /* NetLabel state */ NLBL_UNSET = 0, NLBL_REQUIRE, NLBL_LABELED, NLBL_REQSKB, NLBL_CONNLABELED, } nlbl_state; struct netlbl_lsm_secattr *nlbl_secattr; /* NetLabel sec attributes */ #endif u32 sid; /* SID of this object */ u32 peer_sid; /* SID of peer */ u16 sclass; /* sock security class */ enum { /* SCTP association state */ SCTP_ASSOC_UNSET = 0, SCTP_ASSOC_SET, } sctp_assoc_state; }; struct tun_security_struct { u32 sid; /* SID for the tun device sockets */ }; struct key_security_struct { u32 sid; /* SID of key */ }; struct ib_security_struct { u32 sid; /* SID of the queue pair or MAD agent */ }; struct pkey_security_struct { u64 subnet_prefix; /* Port subnet prefix */ u16 pkey; /* PKey number */ u32 sid; /* SID of pkey */ }; struct bpf_security_struct { u32 sid; /* SID of bpf obj creator */ }; struct perf_event_security_struct { u32 sid; /* SID of perf_event obj creator */ }; extern struct lsm_blob_sizes selinux_blob_sizes; static inline struct task_security_struct *selinux_cred(const struct cred *cred) { return cred->security + selinux_blob_sizes.lbs_cred; } static inline struct file_security_struct *selinux_file(const struct file *file) { return file->f_security + selinux_blob_sizes.lbs_file; } static inline struct inode_security_struct *selinux_inode( const struct inode *inode) { if (unlikely(!inode->i_security)) return NULL; return inode->i_security + selinux_blob_sizes.lbs_inode; } static inline struct msg_security_struct *selinux_msg_msg( const struct msg_msg *msg_msg) { return msg_msg->security + selinux_blob_sizes.lbs_msg_msg; } static inline struct ipc_security_struct *selinux_ipc( const struct kern_ipc_perm *ipc) { return ipc->security + selinux_blob_sizes.lbs_ipc; } /* * get the subjective security ID of the current task */ static inline u32 current_sid(void) { const struct task_security_struct *tsec = selinux_cred(current_cred()); return tsec->sid; } #endif /* _SELINUX_OBJSEC_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_LOCAL_H #define _ASM_X86_LOCAL_H #include <linux/percpu.h> #include <linux/atomic.h> #include <asm/asm.h> typedef struct { atomic_long_t a; } local_t; #define LOCAL_INIT(i) { ATOMIC_LONG_INIT(i) } #define local_read(l) atomic_long_read(&(l)->a) #define local_set(l, i) atomic_long_set(&(l)->a, (i)) static inline void local_inc(local_t *l) { asm volatile(_ASM_INC "%0" : "+m" (l->a.counter)); } static inline void local_dec(local_t *l) { asm volatile(_ASM_DEC "%0" : "+m" (l->a.counter)); } static inline void local_add(long i, local_t *l) { asm volatile(_ASM_ADD "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } static inline void local_sub(long i, local_t *l) { asm volatile(_ASM_SUB "%1,%0" : "+m" (l->a.counter) : "ir" (i)); } /** * local_sub_and_test - subtract value from variable and test result * @i: integer value to subtract * @l: pointer to type local_t * * Atomically subtracts @i from @l and returns * true if the result is zero, or false for all * other cases. */ static inline bool local_sub_and_test(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, e, "er", i); } /** * local_dec_and_test - decrement and test * @l: pointer to type local_t * * Atomically decrements @l by 1 and * returns true if the result is 0, or false for all other * cases. */ static inline bool local_dec_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, e); } /** * local_inc_and_test - increment and test * @l: pointer to type local_t * * Atomically increments @l by 1 * and returns true if the result is zero, or false for all * other cases. */ static inline bool local_inc_and_test(local_t *l) { return GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, e); } /** * local_add_negative - add and test if negative * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns true * if the result is negative, or false when * result is greater than or equal to zero. */ static inline bool local_add_negative(long i, local_t *l) { return GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, s, "er", i); } /** * local_add_return - add and return * @i: integer value to add * @l: pointer to type local_t * * Atomically adds @i to @l and returns @i + @l */ static inline long local_add_return(long i, local_t *l) { long __i = i; asm volatile(_ASM_XADD "%0, %1;" : "+r" (i), "+m" (l->a.counter) : : "memory"); return i + __i; } static inline long local_sub_return(long i, local_t *l) { return local_add_return(-i, l); } #define local_inc_return(l) (local_add_return(1, l)) #define local_dec_return(l) (local_sub_return(1, l)) #define local_cmpxchg(l, o, n) \ (cmpxchg_local(&((l)->a.counter), (o), (n))) /* Always has a lock prefix */ #define local_xchg(l, n) (xchg(&((l)->a.counter), (n))) /** * local_add_unless - add unless the number is a given value * @l: pointer of type local_t * @a: the amount to add to l... * @u: ...unless l is equal to u. * * Atomically adds @a to @l, so long as it was not @u. * Returns non-zero if @l was not @u, and zero otherwise. */ #define local_add_unless(l, a, u) \ ({ \ long c, old; \ c = local_read((l)); \ for (;;) { \ if (unlikely(c == (u))) \ break; \ old = local_cmpxchg((l), c, c + (a)); \ if (likely(old == c)) \ break; \ c = old; \ } \ c != (u); \ }) #define local_inc_not_zero(l) local_add_unless((l), 1, 0) /* On x86_32, these are no better than the atomic variants. * On x86-64 these are better than the atomic variants on SMP kernels * because they dont use a lock prefix. */ #define __local_inc(l) local_inc(l) #define __local_dec(l) local_dec(l) #define __local_add(i, l) local_add((i), (l)) #define __local_sub(i, l) local_sub((i), (l)) #endif /* _ASM_X86_LOCAL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_CGROUP_H #define _LINUX_CGROUP_H /* * cgroup interface * * Copyright (C) 2003 BULL SA * Copyright (C) 2004-2006 Silicon Graphics, Inc. * */ #include <linux/sched.h> #include <linux/cpumask.h> #include <linux/nodemask.h> #include <linux/rculist.h> #include <linux/cgroupstats.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/kernfs.h> #include <linux/jump_label.h> #include <linux/types.h> #include <linux/ns_common.h> #include <linux/nsproxy.h> #include <linux/user_namespace.h> #include <linux/refcount.h> #include <linux/kernel_stat.h> #include <linux/cgroup-defs.h> struct kernel_clone_args; #ifdef CONFIG_CGROUPS /* * All weight knobs on the default hierarhcy should use the following min, * default and max values. The default value is the logarithmic center of * MIN and MAX and allows 100x to be expressed in both directions. */ #define CGROUP_WEIGHT_MIN 1 #define CGROUP_WEIGHT_DFL 100 #define CGROUP_WEIGHT_MAX 10000 /* walk only threadgroup leaders */ #define CSS_TASK_ITER_PROCS (1U << 0) /* walk all threaded css_sets in the domain */ #define CSS_TASK_ITER_THREADED (1U << 1) /* internal flags */ #define CSS_TASK_ITER_SKIPPED (1U << 16) /* a css_task_iter should be treated as an opaque object */ struct css_task_iter { struct cgroup_subsys *ss; unsigned int flags; struct list_head *cset_pos; struct list_head *cset_head; struct list_head *tcset_pos; struct list_head *tcset_head; struct list_head *task_pos; struct list_head *cur_tasks_head; struct css_set *cur_cset; struct css_set *cur_dcset; struct task_struct *cur_task; struct list_head iters_node; /* css_set->task_iters */ }; extern struct cgroup_root cgrp_dfl_root; extern struct css_set init_css_set; #define SUBSYS(_x) extern struct cgroup_subsys _x ## _cgrp_subsys; #include <linux/cgroup_subsys.h> #undef SUBSYS #define SUBSYS(_x) \ extern struct static_key_true _x ## _cgrp_subsys_enabled_key; \ extern struct static_key_true _x ## _cgrp_subsys_on_dfl_key; #include <linux/cgroup_subsys.h> #undef SUBSYS /** * cgroup_subsys_enabled - fast test on whether a subsys is enabled * @ss: subsystem in question */ #define cgroup_subsys_enabled(ss) \ static_branch_likely(&ss ## _enabled_key) /** * cgroup_subsys_on_dfl - fast test on whether a subsys is on default hierarchy * @ss: subsystem in question */ #define cgroup_subsys_on_dfl(ss) \ static_branch_likely(&ss ## _on_dfl_key) bool css_has_online_children(struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *cgroup_get_e_css(struct cgroup *cgroup, struct cgroup_subsys *ss); struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss); struct cgroup *cgroup_get_from_path(const char *path); struct cgroup *cgroup_get_from_fd(int fd); int cgroup_attach_task_all(struct task_struct *from, struct task_struct *); int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); int cgroup_rm_cftypes(struct cftype *cfts); void cgroup_file_notify(struct cgroup_file *cfile); int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen); int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry); int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *tsk); void cgroup_fork(struct task_struct *p); extern int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); void cgroup_exit(struct task_struct *p); void cgroup_release(struct task_struct *p); void cgroup_free(struct task_struct *p); int cgroup_init_early(void); int cgroup_init(void); int cgroup_parse_float(const char *input, unsigned dec_shift, s64 *v); /* * Iteration helpers and macros. */ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *parent); struct cgroup_subsys_state *css_next_descendant_pre(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct cgroup_subsys_state *css_rightmost_descendant(struct cgroup_subsys_state *pos); struct cgroup_subsys_state *css_next_descendant_post(struct cgroup_subsys_state *pos, struct cgroup_subsys_state *css); struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, struct cgroup_subsys_state **dst_cssp); void css_task_iter_start(struct cgroup_subsys_state *css, unsigned int flags, struct css_task_iter *it); struct task_struct *css_task_iter_next(struct css_task_iter *it); void css_task_iter_end(struct css_task_iter *it); /** * css_for_each_child - iterate through children of a css * @pos: the css * to use as the loop cursor * @parent: css whose children to walk * * Walk @parent's children. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_child(pos, parent) \ for ((pos) = css_next_child(NULL, (parent)); (pos); \ (pos) = css_next_child((pos), (parent))) /** * css_for_each_descendant_pre - pre-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @root: css whose descendants to walk * * Walk @root's descendants. @root is included in the iteration and the * first node to be visited. Must be called under rcu_read_lock(). * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * For example, the following guarantees that a descendant can't escape * state updates of its ancestors. * * my_online(@css) * { * Lock @css's parent and @css; * Inherit state from the parent; * Unlock both. * } * * my_update_state(@css) * { * css_for_each_descendant_pre(@pos, @css) { * Lock @pos; * if (@pos == @css) * Update @css's state; * else * Verify @pos is alive and inherit state from its parent; * Unlock @pos; * } * } * * As long as the inheriting step, including checking the parent state, is * enclosed inside @pos locking, double-locking the parent isn't necessary * while inheriting. The state update to the parent is guaranteed to be * visible by walking order and, as long as inheriting operations to the * same @pos are atomic to each other, multiple updates racing each other * still result in the correct state. It's guaranateed that at least one * inheritance happens for any css after the latest update to its parent. * * If checking parent's state requires locking the parent, each inheriting * iteration should lock and unlock both @pos->parent and @pos. * * Alternatively, a subsystem may choose to use a single global lock to * synchronize ->css_online() and ->css_offline() against tree-walking * operations. * * It is allowed to temporarily drop RCU read lock during iteration. The * caller is responsible for ensuring that @pos remains accessible until * the start of the next iteration by, for example, bumping the css refcnt. */ #define css_for_each_descendant_pre(pos, css) \ for ((pos) = css_next_descendant_pre(NULL, (css)); (pos); \ (pos) = css_next_descendant_pre((pos), (css))) /** * css_for_each_descendant_post - post-order walk of a css's descendants * @pos: the css * to use as the loop cursor * @css: css whose descendants to walk * * Similar to css_for_each_descendant_pre() but performs post-order * traversal instead. @root is included in the iteration and the last * node to be visited. * * If a subsystem synchronizes ->css_online() and the start of iteration, a * css which finished ->css_online() is guaranteed to be visible in the * future iterations and will stay visible until the last reference is put. * A css which hasn't finished ->css_online() or already finished * ->css_offline() may show up during traversal. It's each subsystem's * responsibility to synchronize against on/offlining. * * Note that the walk visibility guarantee example described in pre-order * walk doesn't apply the same to post-order walks. */ #define css_for_each_descendant_post(pos, css) \ for ((pos) = css_next_descendant_post(NULL, (css)); (pos); \ (pos) = css_next_descendant_post((pos), (css))) /** * cgroup_taskset_for_each - iterate cgroup_taskset * @task: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * @tset may contain multiple tasks and they may belong to multiple * processes. * * On the v2 hierarchy, there may be tasks from multiple processes and they * may not share the source or destination csses. * * On traditional hierarchies, when there are multiple tasks in @tset, if a * task of a process is in @tset, all tasks of the process are in @tset. * Also, all are guaranteed to share the same source and destination csses. * * Iteration is not in any specific order. */ #define cgroup_taskset_for_each(task, dst_css, tset) \ for ((task) = cgroup_taskset_first((tset), &(dst_css)); \ (task); \ (task) = cgroup_taskset_next((tset), &(dst_css))) /** * cgroup_taskset_for_each_leader - iterate group leaders in a cgroup_taskset * @leader: the loop cursor * @dst_css: the destination css * @tset: taskset to iterate * * Iterate threadgroup leaders of @tset. For single-task migrations, @tset * may not contain any. */ #define cgroup_taskset_for_each_leader(leader, dst_css, tset) \ for ((leader) = cgroup_taskset_first((tset), &(dst_css)); \ (leader); \ (leader) = cgroup_taskset_next((tset), &(dst_css))) \ if ((leader) != (leader)->group_leader) \ ; \ else /* * Inline functions. */ static inline u64 cgroup_id(struct cgroup *cgrp) { return cgrp->kn->id; } /** * css_get - obtain a reference on the specified css * @css: target css * * The caller must already have a reference. */ static inline void css_get(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get(&css->refcnt); } /** * css_get_many - obtain references on the specified css * @css: target css * @n: number of references to get * * The caller must already have a reference. */ static inline void css_get_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_get_many(&css->refcnt, n); } /** * css_tryget - try to obtain a reference on the specified css * @css: target css * * Obtain a reference on @css unless it already has reached zero and is * being released. This function doesn't care whether @css is on or * offline. The caller naturally needs to ensure that @css is accessible * but doesn't have to be holding a reference on it - IOW, RCU protected * access is good enough for this function. Returns %true if a reference * count was successfully obtained; %false otherwise. */ static inline bool css_tryget(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget(&css->refcnt); return true; } /** * css_tryget_online - try to obtain a reference on the specified css if online * @css: target css * * Obtain a reference on @css if it's online. The caller naturally needs * to ensure that @css is accessible but doesn't have to be holding a * reference on it - IOW, RCU protected access is good enough for this * function. Returns %true if a reference count was successfully obtained; * %false otherwise. */ static inline bool css_tryget_online(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) return percpu_ref_tryget_live(&css->refcnt); return true; } /** * css_is_dying - test whether the specified css is dying * @css: target css * * Test whether @css is in the process of offlining or already offline. In * most cases, ->css_online() and ->css_offline() callbacks should be * enough; however, the actual offline operations are RCU delayed and this * test returns %true also when @css is scheduled to be offlined. * * This is useful, for example, when the use case requires synchronous * behavior with respect to cgroup removal. cgroup removal schedules css * offlining but the css can seem alive while the operation is being * delayed. If the delay affects user visible semantics, this test can be * used to resolve the situation. */ static inline bool css_is_dying(struct cgroup_subsys_state *css) { return !(css->flags & CSS_NO_REF) && percpu_ref_is_dying(&css->refcnt); } /** * css_put - put a css reference * @css: target css * * Put a reference obtained via css_get() and css_tryget_online(). */ static inline void css_put(struct cgroup_subsys_state *css) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put(&css->refcnt); } /** * css_put_many - put css references * @css: target css * @n: number of references to put * * Put references obtained via css_get() and css_tryget_online(). */ static inline void css_put_many(struct cgroup_subsys_state *css, unsigned int n) { if (!(css->flags & CSS_NO_REF)) percpu_ref_put_many(&css->refcnt, n); } static inline void cgroup_get(struct cgroup *cgrp) { css_get(&cgrp->self); } static inline bool cgroup_tryget(struct cgroup *cgrp) { return css_tryget(&cgrp->self); } static inline void cgroup_put(struct cgroup *cgrp) { css_put(&cgrp->self); } /** * task_css_set_check - obtain a task's css_set with extra access conditions * @task: the task to obtain css_set for * @__c: extra condition expression to be passed to rcu_dereference_check() * * A task's css_set is RCU protected, initialized and exited while holding * task_lock(), and can only be modified while holding both cgroup_mutex * and task_lock() while the task is alive. This macro verifies that the * caller is inside proper critical section and returns @task's css_set. * * The caller can also specify additional allowed conditions via @__c, such * as locks used during the cgroup_subsys::attach() methods. */ #ifdef CONFIG_PROVE_RCU extern struct mutex cgroup_mutex; extern spinlock_t css_set_lock; #define task_css_set_check(task, __c) \ rcu_dereference_check((task)->cgroups, \ lockdep_is_held(&cgroup_mutex) || \ lockdep_is_held(&css_set_lock) || \ ((task)->flags & PF_EXITING) || (__c)) #else #define task_css_set_check(task, __c) \ rcu_dereference((task)->cgroups) #endif /** * task_css_check - obtain css for (task, subsys) w/ extra access conds * @task: the target task * @subsys_id: the target subsystem ID * @__c: extra condition expression to be passed to rcu_dereference_check() * * Return the cgroup_subsys_state for the (@task, @subsys_id) pair. The * synchronization rules are the same as task_css_set_check(). */ #define task_css_check(task, subsys_id, __c) \ task_css_set_check((task), (__c))->subsys[(subsys_id)] /** * task_css_set - obtain a task's css_set * @task: the task to obtain css_set for * * See task_css_set_check(). */ static inline struct css_set *task_css_set(struct task_struct *task) { return task_css_set_check(task, false); } /** * task_css - obtain css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * See task_css_check(). */ static inline struct cgroup_subsys_state *task_css(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, false); } /** * task_get_css - find and get the css for (task, subsys) * @task: the target task * @subsys_id: the target subsystem ID * * Find the css for the (@task, @subsys_id) combination, increment a * reference on and return it. This function is guaranteed to return a * valid css. The returned css may already have been offlined. */ static inline struct cgroup_subsys_state * task_get_css(struct task_struct *task, int subsys_id) { struct cgroup_subsys_state *css; rcu_read_lock(); while (true) { css = task_css(task, subsys_id); /* * Can't use css_tryget_online() here. A task which has * PF_EXITING set may stay associated with an offline css. * If such task calls this function, css_tryget_online() * will keep failing. */ if (likely(css_tryget(css))) break; cpu_relax(); } rcu_read_unlock(); return css; } /** * task_css_is_root - test whether a task belongs to the root css * @task: the target task * @subsys_id: the target subsystem ID * * Test whether @task belongs to the root css on the specified subsystem. * May be invoked in any context. */ static inline bool task_css_is_root(struct task_struct *task, int subsys_id) { return task_css_check(task, subsys_id, true) == init_css_set.subsys[subsys_id]; } static inline struct cgroup *task_cgroup(struct task_struct *task, int subsys_id) { return task_css(task, subsys_id)->cgroup; } static inline struct cgroup *task_dfl_cgroup(struct task_struct *task) { return task_css_set(task)->dfl_cgrp; } static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { struct cgroup_subsys_state *parent_css = cgrp->self.parent; if (parent_css) return container_of(parent_css, struct cgroup, self); return NULL; } /** * cgroup_is_descendant - test ancestry * @cgrp: the cgroup to be tested * @ancestor: possible ancestor of @cgrp * * Test whether @cgrp is a descendant of @ancestor. It also returns %true * if @cgrp == @ancestor. This function is safe to call as long as @cgrp * and @ancestor are accessible. */ static inline bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) { if (cgrp->root != ancestor->root || cgrp->level < ancestor->level) return false; return cgrp->ancestor_ids[ancestor->level] == cgroup_id(ancestor); } /** * cgroup_ancestor - find ancestor of cgroup * @cgrp: cgroup to find ancestor of * @ancestor_level: level of ancestor to find starting from root * * Find ancestor of cgroup at specified level starting from root if it exists * and return pointer to it. Return NULL if @cgrp doesn't have ancestor at * @ancestor_level. * * This function is safe to call as long as @cgrp is accessible. */ static inline struct cgroup *cgroup_ancestor(struct cgroup *cgrp, int ancestor_level) { if (cgrp->level < ancestor_level) return NULL; while (cgrp && cgrp->level > ancestor_level) cgrp = cgroup_parent(cgrp); return cgrp; } /** * task_under_cgroup_hierarchy - test task's membership of cgroup ancestry * @task: the task to be tested * @ancestor: possible ancestor of @task's cgroup * * Tests whether @task's default cgroup hierarchy is a descendant of @ancestor. * It follows all the same rules as cgroup_is_descendant, and only applies * to the default hierarchy. */ static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { struct css_set *cset = task_css_set(task); return cgroup_is_descendant(cset->dfl_cgrp, ancestor); } /* no synchronization, the result can only be used as a hint */ static inline bool cgroup_is_populated(struct cgroup *cgrp) { return cgrp->nr_populated_csets + cgrp->nr_populated_domain_children + cgrp->nr_populated_threaded_children; } /* returns ino associated with a cgroup */ static inline ino_t cgroup_ino(struct cgroup *cgrp) { return kernfs_ino(cgrp->kn); } /* cft/css accessors for cftype->write() operation */ static inline struct cftype *of_cft(struct kernfs_open_file *of) { return of->kn->priv; } struct cgroup_subsys_state *of_css(struct kernfs_open_file *of); /* cft/css accessors for cftype->seq_*() operations */ static inline struct cftype *seq_cft(struct seq_file *seq) { return of_cft(seq->private); } static inline struct cgroup_subsys_state *seq_css(struct seq_file *seq) { return of_css(seq->private); } /* * Name / path handling functions. All are thin wrappers around the kernfs * counterparts and can be called under any context. */ static inline int cgroup_name(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_name(cgrp->kn, buf, buflen); } static inline int cgroup_path(struct cgroup *cgrp, char *buf, size_t buflen) { return kernfs_path(cgrp->kn, buf, buflen); } static inline void pr_cont_cgroup_name(struct cgroup *cgrp) { pr_cont_kernfs_name(cgrp->kn); } static inline void pr_cont_cgroup_path(struct cgroup *cgrp) { pr_cont_kernfs_path(cgrp->kn); } static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { return &cgrp->psi; } static inline void cgroup_init_kthreadd(void) { /* * kthreadd is inherited by all kthreads, keep it in the root so * that the new kthreads are guaranteed to stay in the root until * initialization is finished. */ current->no_cgroup_migration = 1; } static inline void cgroup_kthread_ready(void) { /* * This kthread finished initialization. The creator should have * set PF_NO_SETAFFINITY if this kthread should stay in the root. */ current->no_cgroup_migration = 0; } void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen); #else /* !CONFIG_CGROUPS */ struct cgroup_subsys_state; struct cgroup; static inline u64 cgroup_id(struct cgroup *cgrp) { return 1; } static inline void css_get(struct cgroup_subsys_state *css) {} static inline void css_put(struct cgroup_subsys_state *css) {} static inline int cgroup_attach_task_all(struct task_struct *from, struct task_struct *t) { return 0; } static inline int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) { return -EINVAL; } static inline void cgroup_fork(struct task_struct *p) {} static inline int cgroup_can_fork(struct task_struct *p, struct kernel_clone_args *kargs) { return 0; } static inline void cgroup_cancel_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) {} static inline void cgroup_exit(struct task_struct *p) {} static inline void cgroup_release(struct task_struct *p) {} static inline void cgroup_free(struct task_struct *p) {} static inline int cgroup_init_early(void) { return 0; } static inline int cgroup_init(void) { return 0; } static inline void cgroup_init_kthreadd(void) {} static inline void cgroup_kthread_ready(void) {} static inline struct cgroup *cgroup_parent(struct cgroup *cgrp) { return NULL; } static inline struct psi_group *cgroup_psi(struct cgroup *cgrp) { return NULL; } static inline bool task_under_cgroup_hierarchy(struct task_struct *task, struct cgroup *ancestor) { return true; } static inline void cgroup_path_from_kernfs_id(u64 id, char *buf, size_t buflen) {} #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUPS /* * cgroup scalable recursive statistics. */ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu); void cgroup_rstat_flush(struct cgroup *cgrp); void cgroup_rstat_flush_irqsafe(struct cgroup *cgrp); void cgroup_rstat_flush_hold(struct cgroup *cgrp); void cgroup_rstat_flush_release(void); /* * Basic resource stats. */ #ifdef CONFIG_CGROUP_CPUACCT void cpuacct_charge(struct task_struct *tsk, u64 cputime); void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} static inline void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) {} #endif void __cgroup_account_cputime(struct cgroup *cgrp, u64 delta_exec); void __cgroup_account_cputime_field(struct cgroup *cgrp, enum cpu_usage_stat index, u64 delta_exec); static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) { struct cgroup *cgrp; cpuacct_charge(task, delta_exec); rcu_read_lock(); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime(cgrp, delta_exec); rcu_read_unlock(); } static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) { struct cgroup *cgrp; cpuacct_account_field(task, index, delta_exec); rcu_read_lock(); cgrp = task_dfl_cgroup(task); if (cgroup_parent(cgrp)) __cgroup_account_cputime_field(cgrp, index, delta_exec); rcu_read_unlock(); } #else /* CONFIG_CGROUPS */ static inline void cgroup_account_cputime(struct task_struct *task, u64 delta_exec) {} static inline void cgroup_account_cputime_field(struct task_struct *task, enum cpu_usage_stat index, u64 delta_exec) {} #endif /* CONFIG_CGROUPS */ /* * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data * definition in cgroup-defs.h. */ #ifdef CONFIG_SOCK_CGROUP_DATA #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) extern spinlock_t cgroup_sk_update_lock; #endif void cgroup_sk_alloc_disable(void); void cgroup_sk_alloc(struct sock_cgroup_data *skcd); void cgroup_sk_clone(struct sock_cgroup_data *skcd); void cgroup_sk_free(struct sock_cgroup_data *skcd); static inline struct cgroup *sock_cgroup_ptr(struct sock_cgroup_data *skcd) { #if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) unsigned long v; /* * @skcd->val is 64bit but the following is safe on 32bit too as we * just need the lower ulong to be written and read atomically. */ v = READ_ONCE(skcd->val); if (v & 3) return &cgrp_dfl_root.cgrp; return (struct cgroup *)(unsigned long)v ?: &cgrp_dfl_root.cgrp; #else return (struct cgroup *)(unsigned long)skcd->val; #endif } #else /* CONFIG_CGROUP_DATA */ static inline void cgroup_sk_alloc(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_clone(struct sock_cgroup_data *skcd) {} static inline void cgroup_sk_free(struct sock_cgroup_data *skcd) {} #endif /* CONFIG_CGROUP_DATA */ struct cgroup_namespace { refcount_t count; struct ns_common ns; struct user_namespace *user_ns; struct ucounts *ucounts; struct css_set *root_cset; }; extern struct cgroup_namespace init_cgroup_ns; #ifdef CONFIG_CGROUPS void free_cgroup_ns(struct cgroup_namespace *ns); struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns); int cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, struct cgroup_namespace *ns); #else /* !CONFIG_CGROUPS */ static inline void free_cgroup_ns(struct cgroup_namespace *ns) { } static inline struct cgroup_namespace * copy_cgroup_ns(unsigned long flags, struct user_namespace *user_ns, struct cgroup_namespace *old_ns) { return old_ns; } #endif /* !CONFIG_CGROUPS */ static inline void get_cgroup_ns(struct cgroup_namespace *ns) { if (ns) refcount_inc(&ns->count); } static inline void put_cgroup_ns(struct cgroup_namespace *ns) { if (ns && refcount_dec_and_test(&ns->count)) free_cgroup_ns(ns); } #ifdef CONFIG_CGROUPS void cgroup_enter_frozen(void); void cgroup_leave_frozen(bool always_leave); void cgroup_update_frozen(struct cgroup *cgrp); void cgroup_freeze(struct cgroup *cgrp, bool freeze); void cgroup_freezer_migrate_task(struct task_struct *task, struct cgroup *src, struct cgroup *dst); static inline bool cgroup_task_freeze(struct task_struct *task) { bool ret; if (task->flags & PF_KTHREAD) return false; rcu_read_lock(); ret = test_bit(CGRP_FREEZE, &task_dfl_cgroup(task)->flags); rcu_read_unlock(); return ret; } static inline bool cgroup_task_frozen(struct task_struct *task) { return task->frozen; } #else /* !CONFIG_CGROUPS */ static inline void cgroup_enter_frozen(void) { } static inline void cgroup_leave_frozen(bool always_leave) { } static inline bool cgroup_task_freeze(struct task_struct *task) { return false; } static inline bool cgroup_task_frozen(struct task_struct *task) { return false; } #endif /* !CONFIG_CGROUPS */ #ifdef CONFIG_CGROUP_BPF static inline void cgroup_bpf_get(struct cgroup *cgrp) { percpu_ref_get(&cgrp->bpf.refcnt); } static inline void cgroup_bpf_put(struct cgroup *cgrp) { percpu_ref_put(&cgrp->bpf.refcnt); } #else /* CONFIG_CGROUP_BPF */ static inline void cgroup_bpf_get(struct cgroup *cgrp) {} static inline void cgroup_bpf_put(struct cgroup *cgrp) {} #endif /* CONFIG_CGROUP_BPF */ #endif /* _LINUX_CGROUP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H #include "blk-mq.h" #include "blk-mq-tag.h" void blk_mq_sched_assign_ioc(struct request *rq); void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async); void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); void blk_mq_sched_free_requests(struct request_queue *q); static inline bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { if (blk_queue_nomerges(q) || !bio_mergeable(bio)) return false; return __blk_mq_sched_bio_merge(q, bio, nr_segs); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { struct elevator_queue *e = q->elevator; if (e && e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { struct elevator_queue *e = rq->q->elevator; if (e && e->type->ops.completed_request) e->type->ops.completed_request(rq, now); } static inline void blk_mq_sched_requeue_request(struct request *rq) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; if (e && e->type->ops.has_work) return e->type->ops.has_work(hctx); return false; } static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) { return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } #endif
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MMZONE_H #define _LINUX_MMZONE_H #ifndef __ASSEMBLY__ #ifndef __GENERATING_BOUNDS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/wait.h> #include <linux/bitops.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/numa.h> #include <linux/init.h> #include <linux/seqlock.h> #include <linux/nodemask.h> #include <linux/pageblock-flags.h> #include <linux/page-flags-layout.h> #include <linux/atomic.h> #include <linux/mm_types.h> #include <linux/page-flags.h> #include <asm/page.h> /* Free memory management - zoned buddy allocator. */ #ifndef CONFIG_FORCE_MAX_ZONEORDER #define MAX_ORDER 11 #else #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER #endif #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1)) /* * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed * costly to service. That is between allocation orders which should * coalesce naturally under reasonable reclaim pressure and those which * will not. */ #define PAGE_ALLOC_COSTLY_ORDER 3 enum migratetype { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_RECLAIMABLE, MIGRATE_PCPTYPES, /* the number of types on the pcp lists */ MIGRATE_HIGHATOMIC = MIGRATE_PCPTYPES, #ifdef CONFIG_CMA /* * MIGRATE_CMA migration type is designed to mimic the way * ZONE_MOVABLE works. Only movable pages can be allocated * from MIGRATE_CMA pageblocks and page allocator never * implicitly change migration type of MIGRATE_CMA pageblock. * * The way to use it is to change migratetype of a range of * pageblocks to MIGRATE_CMA which can be done by * __free_pageblock_cma() function. What is important though * is that a range of pageblocks must be aligned to * MAX_ORDER_NR_PAGES should biggest page be bigger then * a single pageblock. */ MIGRATE_CMA, #endif #ifdef CONFIG_MEMORY_ISOLATION MIGRATE_ISOLATE, /* can't allocate from here */ #endif MIGRATE_TYPES }; /* In mm/page_alloc.c; keep in sync also with show_migration_types() there */ extern const char * const migratetype_names[MIGRATE_TYPES]; #ifdef CONFIG_CMA # define is_migrate_cma(migratetype) unlikely((migratetype) == MIGRATE_CMA) # define is_migrate_cma_page(_page) (get_pageblock_migratetype(_page) == MIGRATE_CMA) #else # define is_migrate_cma(migratetype) false # define is_migrate_cma_page(_page) false #endif static inline bool is_migrate_movable(int mt) { return is_migrate_cma(mt) || mt == MIGRATE_MOVABLE; } #define for_each_migratetype_order(order, type) \ for (order = 0; order < MAX_ORDER; order++) \ for (type = 0; type < MIGRATE_TYPES; type++) extern int page_group_by_mobility_disabled; #define MIGRATETYPE_MASK ((1UL << PB_migratetype_bits) - 1) #define get_pageblock_migratetype(page) \ get_pfnblock_flags_mask(page, page_to_pfn(page), MIGRATETYPE_MASK) struct free_area { struct list_head free_list[MIGRATE_TYPES]; unsigned long nr_free; }; static inline struct page *get_page_from_free_area(struct free_area *area, int migratetype) { return list_first_entry_or_null(&area->free_list[migratetype], struct page, lru); } static inline bool free_area_empty(struct free_area *area, int migratetype) { return list_empty(&area->free_list[migratetype]); } struct pglist_data; /* * zone->lock and the zone lru_lock are two of the hottest locks in the kernel. * So add a wild amount of padding here to ensure that they fall into separate * cachelines. There are very few zone structures in the machine, so space * consumption is not a concern here. */ #if defined(CONFIG_SMP) struct zone_padding { char x[0]; } ____cacheline_internodealigned_in_smp; #define ZONE_PADDING(name) struct zone_padding name; #else #define ZONE_PADDING(name) #endif #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ NUMA_MISS, /* allocated in non intended node */ NUMA_FOREIGN, /* was intended here, hit elsewhere */ NUMA_INTERLEAVE_HIT, /* interleaver preferred this zone */ NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ NR_VM_NUMA_STAT_ITEMS }; #else #define NR_VM_NUMA_STAT_ITEMS 0 #endif enum zone_stat_item { /* First 128 byte cacheline (assuming 64 bit words) */ NR_FREE_PAGES, NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */ NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE, NR_ZONE_ACTIVE_ANON, NR_ZONE_INACTIVE_FILE, NR_ZONE_ACTIVE_FILE, NR_ZONE_UNEVICTABLE, NR_ZONE_WRITE_PENDING, /* Count of dirty, writeback and unstable pages */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ NR_PAGETABLE, /* used for pagetables */ /* Second 128 byte cacheline */ NR_BOUNCE, #if IS_ENABLED(CONFIG_ZSMALLOC) NR_ZSPAGES, /* allocated in zsmalloc */ #endif NR_FREE_CMA_PAGES, NR_VM_ZONE_STAT_ITEMS }; enum node_stat_item { NR_LRU_BASE, NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */ NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ NR_UNEVICTABLE, /* " " " " " */ NR_SLAB_RECLAIMABLE_B, NR_SLAB_UNRECLAIMABLE_B, NR_ISOLATED_ANON, /* Temporary isolated pages from anon lru */ NR_ISOLATED_FILE, /* Temporary isolated pages from file lru */ WORKINGSET_NODES, WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_ANON = WORKINGSET_REFAULT_BASE, WORKINGSET_REFAULT_FILE, WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_ANON = WORKINGSET_ACTIVATE_BASE, WORKINGSET_ACTIVATE_FILE, WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_ANON = WORKINGSET_RESTORE_BASE, WORKINGSET_RESTORE_FILE, WORKINGSET_NODERECLAIM, NR_ANON_MAPPED, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ NR_FILE_PAGES, NR_FILE_DIRTY, NR_WRITEBACK, NR_WRITEBACK_TEMP, /* Writeback using temporary buffers */ NR_SHMEM, /* shmem pages (included tmpfs/GEM pages) */ NR_SHMEM_THPS, NR_SHMEM_PMDMAPPED, NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ NR_WRITTEN, /* page writings since bootup */ NR_KERNEL_MISC_RECLAIMABLE, /* reclaimable non-slab kernel pages */ NR_FOLL_PIN_ACQUIRED, /* via: pin_user_page(), gup flag: FOLL_PIN */ NR_FOLL_PIN_RELEASED, /* pages returned via unpin_user_page() */ NR_KERNEL_STACK_KB, /* measured in KiB */ #if IS_ENABLED(CONFIG_SHADOW_CALL_STACK) NR_KERNEL_SCS_KB, /* measured in KiB */ #endif NR_VM_NODE_STAT_ITEMS }; /* * Returns true if the value is measured in bytes (most vmstat values are * measured in pages). This defines the API part, the internal representation * might be different. */ static __always_inline bool vmstat_item_in_bytes(int idx) { /* * Global and per-node slab counters track slab pages. * It's expected that changes are multiples of PAGE_SIZE. * Internally values are stored in pages. * * Per-memcg and per-lruvec counters track memory, consumed * by individual slab objects. These counters are actually * byte-precise. */ return (idx == NR_SLAB_RECLAIMABLE_B || idx == NR_SLAB_UNRECLAIMABLE_B); } /* * We do arithmetic on the LRU lists in various places in the code, * so it is important to keep the active lists LRU_ACTIVE higher in * the array than the corresponding inactive lists, and to keep * the *_FILE lists LRU_FILE higher than the corresponding _ANON lists. * * This has to be kept in sync with the statistics in zone_stat_item * above and the descriptions in vmstat_text in mm/vmstat.c */ #define LRU_BASE 0 #define LRU_ACTIVE 1 #define LRU_FILE 2 enum lru_list { LRU_INACTIVE_ANON = LRU_BASE, LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, LRU_UNEVICTABLE, NR_LRU_LISTS }; #define for_each_lru(lru) for (lru = 0; lru < NR_LRU_LISTS; lru++) #define for_each_evictable_lru(lru) for (lru = 0; lru <= LRU_ACTIVE_FILE; lru++) static inline bool is_file_lru(enum lru_list lru) { return (lru == LRU_INACTIVE_FILE || lru == LRU_ACTIVE_FILE); } static inline bool is_active_lru(enum lru_list lru) { return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } #define ANON_AND_FILE 2 enum lruvec_flags { LRUVEC_CONGESTED, /* lruvec has many dirty pages * backed by a congested BDI */ }; struct lruvec { struct list_head lists[NR_LRU_LISTS]; /* * These track the cost of reclaiming one LRU - file or anon - * over the other. As the observed cost of reclaiming one LRU * increases, the reclaim scan balance tips toward the other. */ unsigned long anon_cost; unsigned long file_cost; /* Non-resident age, driven by LRU movement */ atomic_long_t nonresident_age; /* Refaults at the time of last reclaim cycle */ unsigned long refaults[ANON_AND_FILE]; /* Various lruvec state flags (enum lruvec_flags) */ unsigned long flags; #ifdef CONFIG_MEMCG struct pglist_data *pgdat; #endif }; /* Isolate unmapped pages */ #define ISOLATE_UNMAPPED ((__force isolate_mode_t)0x2) /* Isolate for asynchronous migration */ #define ISOLATE_ASYNC_MIGRATE ((__force isolate_mode_t)0x4) /* Isolate unevictable pages */ #define ISOLATE_UNEVICTABLE ((__force isolate_mode_t)0x8) /* LRU Isolation modes. */ typedef unsigned __bitwise isolate_mode_t; enum zone_watermarks { WMARK_MIN, WMARK_LOW, WMARK_HIGH, NR_WMARK }; #define min_wmark_pages(z) (z->_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) #define wmark_pages(z, i) (z->_watermark[i] + z->watermark_boost) struct per_cpu_pages { int count; /* number of pages in the list */ int high; /* high watermark, emptying needed */ int batch; /* chunk size for buddy add/remove */ /* Lists of pages, one per migrate type stored on the pcp-lists */ struct list_head lists[MIGRATE_PCPTYPES]; }; struct per_cpu_pageset { struct per_cpu_pages pcp; #ifdef CONFIG_NUMA s8 expire; u16 vm_numa_stat_diff[NR_VM_NUMA_STAT_ITEMS]; #endif #ifdef CONFIG_SMP s8 stat_threshold; s8 vm_stat_diff[NR_VM_ZONE_STAT_ITEMS]; #endif }; struct per_cpu_nodestat { s8 stat_threshold; s8 vm_node_stat_diff[NR_VM_NODE_STAT_ITEMS]; }; #endif /* !__GENERATING_BOUNDS.H */ enum zone_type { /* * ZONE_DMA and ZONE_DMA32 are used when there are peripherals not able * to DMA to all of the addressable memory (ZONE_NORMAL). * On architectures where this area covers the whole 32 bit address * space ZONE_DMA32 is used. ZONE_DMA is left for the ones with smaller * DMA addressing constraints. This distinction is important as a 32bit * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, #endif #ifdef CONFIG_ZONE_DMA32 ZONE_DMA32, #endif /* * Normal addressable memory is in ZONE_NORMAL. DMA operations can be * performed on pages in ZONE_NORMAL if the DMA devices support * transfers to all addressable memory. */ ZONE_NORMAL, #ifdef CONFIG_HIGHMEM /* * A memory area that is only addressable by the kernel through * mapping portions into its own address space. This is for example * used by i386 to allow the kernel to address the memory beyond * 900MB. The kernel will set up special mappings (page * table entries on i386) for each page that the kernel needs to * access. */ ZONE_HIGHMEM, #endif /* * ZONE_MOVABLE is similar to ZONE_NORMAL, except that it contains * movable pages with few exceptional cases described below. Main use * cases for ZONE_MOVABLE are to make memory offlining/unplug more * likely to succeed, and to locally limit unmovable allocations - e.g., * to increase the number of THP/huge pages. Notable special cases are: * * 1. Pinned pages: (long-term) pinning of movable pages might * essentially turn such pages unmovable. Memory offlining might * retry a long time. * 2. memblock allocations: kernelcore/movablecore setups might create * situations where ZONE_MOVABLE contains unmovable allocations * after boot. Memory offlining and allocations fail early. * 3. Memory holes: kernelcore/movablecore setups might create very rare * situations where ZONE_MOVABLE contains memory holes after boot, * for example, if we have sections that are only partially * populated. Memory offlining and allocations fail early. * 4. PG_hwpoison pages: while poisoned pages can be skipped during * memory offlining, such pages cannot be allocated. * 5. Unmovable PG_offline pages: in paravirtualized environments, * hotplugged memory blocks might only partially be managed by the * buddy (e.g., via XEN-balloon, Hyper-V balloon, virtio-mem). The * parts not manged by the buddy are unmovable PG_offline pages. In * some cases (virtio-mem), such pages can be skipped during * memory offlining, however, cannot be moved/allocated. These * techniques might use alloc_contig_range() to hide previously * exposed pages from the buddy again (e.g., to implement some sort * of memory unplug in virtio-mem). * * In general, no unmovable allocations that degrade memory offlining * should end up in ZONE_MOVABLE. Allocators (like alloc_contig_range()) * have to expect that migrating pages in ZONE_MOVABLE can fail (even * if has_unmovable_pages() states that there are no unmovable pages, * there can be false negatives). */ ZONE_MOVABLE, #ifdef CONFIG_ZONE_DEVICE ZONE_DEVICE, #endif __MAX_NR_ZONES }; #ifndef __GENERATING_BOUNDS_H #define ASYNC_AND_SYNC 2 struct zone { /* Read-mostly fields */ /* zone watermarks, access with *_wmark_pages(zone) macros */ unsigned long _watermark[NR_WMARK]; unsigned long watermark_boost; unsigned long nr_reserved_highatomic; /* * We don't know if the memory that we're going to allocate will be * freeable or/and it will be released eventually, so to avoid totally * wasting several GB of ram we must reserve some of the lower zone * memory (otherwise we risk to run OOM on the lower zones despite * there being tons of freeable ram on the higher zones). This array is * recalculated at runtime if the sysctl_lowmem_reserve_ratio sysctl * changes. */ long lowmem_reserve[MAX_NR_ZONES]; #ifdef CONFIG_NEED_MULTIPLE_NODES int node; #endif struct pglist_data *zone_pgdat; struct per_cpu_pageset __percpu *pageset; #ifndef CONFIG_SPARSEMEM /* * Flags for a pageblock_nr_pages block. See pageblock-flags.h. * In SPARSEMEM, this map is stored in struct mem_section */ unsigned long *pageblock_flags; #endif /* CONFIG_SPARSEMEM */ /* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */ unsigned long zone_start_pfn; /* * spanned_pages is the total pages spanned by the zone, including * holes, which is calculated as: * spanned_pages = zone_end_pfn - zone_start_pfn; * * present_pages is physical pages existing within the zone, which * is calculated as: * present_pages = spanned_pages - absent_pages(pages in holes); * * managed_pages is present pages managed by the buddy system, which * is calculated as (reserved_pages includes pages allocated by the * bootmem allocator): * managed_pages = present_pages - reserved_pages; * * So present_pages may be used by memory hotplug or memory power * management logic to figure out unmanaged pages by checking * (present_pages - managed_pages). And managed_pages should be used * by page allocator and vm scanner to calculate all kinds of watermarks * and thresholds. * * Locking rules: * * zone_start_pfn and spanned_pages are protected by span_seqlock. * It is a seqlock because it has to be read outside of zone->lock, * and it is done in the main allocator path. But, it is written * quite infrequently. * * The span_seq lock is declared along with zone->lock because it is * frequently read in proximity to zone->lock. It's good to * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by * mem_hotplug_begin/end(). Any reader who can't tolerant drift of * present_pages should get_online_mems() to get a stable value. */ atomic_long_t managed_pages; unsigned long spanned_pages; unsigned long present_pages; const char *name; #ifdef CONFIG_MEMORY_ISOLATION /* * Number of isolated pageblock. It is used to solve incorrect * freepage counting problem due to racy retrieving migratetype * of pageblock. Protected by zone->lock. */ unsigned long nr_isolate_pageblock; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; #endif int initialized; /* Write-intensive fields used from the page allocator */ ZONE_PADDING(_pad1_) /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; /* zone flags, see below */ unsigned long flags; /* Primarily protects free_area */ spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ ZONE_PADDING(_pad2_) /* * When free pages are below this point, additional steps are taken * when reading the number of free pages to avoid per-cpu counter * drift allowing watermarks to be breached */ unsigned long percpu_drift_mark; #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; /* pfn where compaction migration scanner should start */ unsigned long compact_cached_migrate_pfn[ASYNC_AND_SYNC]; unsigned long compact_init_migrate_pfn; unsigned long compact_init_free_pfn; #endif #ifdef CONFIG_COMPACTION /* * On compaction failure, 1<<compact_defer_shift compactions * are skipped before trying again. The number attempted since * last failure is tracked with compact_considered. * compact_order_failed is the minimum compaction failed order. */ unsigned int compact_considered; unsigned int compact_defer_shift; int compact_order_failed; #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; #endif bool contiguous; ZONE_PADDING(_pad3_) /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; } ____cacheline_internodealigned_in_smp; enum pgdat_flags { PGDAT_DIRTY, /* reclaim scanning has recently found * many dirty file pages at the tail * of the LRU. */ PGDAT_WRITEBACK, /* reclaim scanning has recently found * many pages under writeback */ PGDAT_RECLAIM_LOCKED, /* prevents concurrent reclaim */ }; enum zone_flags { ZONE_BOOSTED_WATERMARK, /* zone recently boosted watermarks. * Cleared when kswapd is woken. */ }; static inline unsigned long zone_managed_pages(struct zone *zone) { return (unsigned long)atomic_long_read(&zone->managed_pages); } static inline unsigned long zone_end_pfn(const struct zone *zone) { return zone->zone_start_pfn + zone->spanned_pages; } static inline bool zone_spans_pfn(const struct zone *zone, unsigned long pfn) { return zone->zone_start_pfn <= pfn && pfn < zone_end_pfn(zone); } static inline bool zone_is_initialized(struct zone *zone) { return zone->initialized; } static inline bool zone_is_empty(struct zone *zone) { return zone->spanned_pages == 0; } /* * Return true if [start_pfn, start_pfn + nr_pages) range has a non-empty * intersection with the given zone */ static inline bool zone_intersects(struct zone *zone, unsigned long start_pfn, unsigned long nr_pages) { if (zone_is_empty(zone)) return false; if (start_pfn >= zone_end_pfn(zone) || start_pfn + nr_pages <= zone->zone_start_pfn) return false; return true; } /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the * queues ("queue_length >> 12") during an aging round. */ #define DEF_PRIORITY 12 /* Maximum number of zones on a zonelist */ #define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES) enum { ZONELIST_FALLBACK, /* zonelist with fallback */ #ifdef CONFIG_NUMA /* * The NUMA zonelists are doubled because we need zonelists that * restrict the allocations to a single node for __GFP_THISNODE. */ ZONELIST_NOFALLBACK, /* zonelist without fallback (__GFP_THISNODE) */ #endif MAX_ZONELISTS }; /* * This struct contains information about a zone in a zonelist. It is stored * here to avoid dereferences into large structures and lookups of tables */ struct zoneref { struct zone *zone; /* Pointer to actual zone */ int zone_idx; /* zone_idx(zoneref->zone) */ }; /* * One allocation request operates on a zonelist. A zonelist * is a list of zones, the first one is the 'goal' of the * allocation, the other zones are fallback zones, in decreasing * priority. * * To speed the reading of the zonelist, the zonerefs contain the zone index * of the entry being read. Helper functions to access information given * a struct zoneref are * * zonelist_zone() - Return the struct zone * for an entry in _zonerefs * zonelist_zone_idx() - Return the index of the zone for an entry * zonelist_node_idx() - Return the index of the node for an entry */ struct zonelist { struct zoneref _zonerefs[MAX_ZONES_PER_ZONELIST + 1]; }; #ifndef CONFIG_DISCONTIGMEM /* The array of struct pages - for discontigmem use pgdat->lmem_map */ extern struct page *mem_map; #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split { spinlock_t split_queue_lock; struct list_head split_queue; unsigned long split_queue_len; }; #endif /* * On NUMA machines, each NUMA node would have a pg_data_t to describe * it's memory layout. On UMA machines there is a single pglist_data which * describes the whole memory. * * Memory statistics and page replacement data structures are maintained on a * per-zone basis. */ typedef struct pglist_data { /* * node_zones contains just the zones for THIS node. Not all of the * zones may be populated, but it is the full list. It is referenced by * this node's node_zonelists as well as other node's node_zonelists. */ struct zone node_zones[MAX_NR_ZONES]; /* * node_zonelists contains references to all zones in all nodes. * Generally the first zones will be references to this node's * node_zones. */ struct zonelist node_zonelists[MAX_ZONELISTS]; int nr_zones; /* number of populated zones in this node */ #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; #ifdef CONFIG_PAGE_EXTENSION struct page_ext *node_page_ext; #endif #endif #if defined(CONFIG_MEMORY_HOTPLUG) || defined(CONFIG_DEFERRED_STRUCT_PAGE_INIT) /* * Must be held any time you expect node_start_pfn, * node_present_pages, node_spanned_pages or nr_zones to stay constant. * Also synchronizes pgdat->first_deferred_pfn during deferred page * init. * * pgdat_resize_lock() and pgdat_resize_unlock() are provided to * manipulate node_size_lock without checking for CONFIG_MEMORY_HOTPLUG * or CONFIG_DEFERRED_STRUCT_PAGE_INIT. * * Nests above zone->lock and zone->span_seqlock */ spinlock_t node_size_lock; #endif unsigned long node_start_pfn; unsigned long node_present_pages; /* total number of physical pages */ unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by mem_hotplug_begin/end() */ int kswapd_order; enum zone_type kswapd_highest_zoneidx; int kswapd_failures; /* Number of 'reclaimed == 0' runs */ #ifdef CONFIG_COMPACTION int kcompactd_max_order; enum zone_type kcompactd_highest_zoneidx; wait_queue_head_t kcompactd_wait; struct task_struct *kcompactd; #endif /* * This is a per-node reserve of pages that are not available * to userspace allocations. */ unsigned long totalreserve_pages; #ifdef CONFIG_NUMA /* * node reclaim becomes active if more unmapped pages exist. */ unsigned long min_unmapped_pages; unsigned long min_slab_pages; #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ ZONE_PADDING(_pad1_) spinlock_t lru_lock; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* * If memory initialisation on large machines is deferred then this * is the first PFN that needs to be initialised. */ unsigned long first_deferred_pfn; #endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif /* Fields commonly accessed by the page reclaim scanner */ /* * NOTE: THIS IS UNUSED IF MEMCG IS ENABLED. * * Use mem_cgroup_lruvec() to look up lruvecs. */ struct lruvec __lruvec; unsigned long flags; ZONE_PADDING(_pad2_) /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; atomic_long_t vm_stat[NR_VM_NODE_STAT_ITEMS]; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) #define node_spanned_pages(nid) (NODE_DATA(nid)->node_spanned_pages) #ifdef CONFIG_FLAT_NODE_MEM_MAP #define pgdat_page_nr(pgdat, pagenr) ((pgdat)->node_mem_map + (pagenr)) #else #define pgdat_page_nr(pgdat, pagenr) pfn_to_page((pgdat)->node_start_pfn + (pagenr)) #endif #define nid_page_nr(