1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NetLabel System * * The NetLabel system manages static and dynamic label mappings for network * protocols such as CIPSO and RIPSO. * * Author: Paul Moore <paul@paul-moore.com> */ /* * (c) Copyright Hewlett-Packard Development Company, L.P., 2006, 2008 */ #ifndef _NETLABEL_H #define _NETLABEL_H #include <linux/types.h> #include <linux/slab.h> #include <linux/net.h> #include <linux/skbuff.h> #include <linux/in.h> #include <linux/in6.h> #include <net/netlink.h> #include <net/request_sock.h> #include <linux/refcount.h> struct cipso_v4_doi; struct calipso_doi; /* * NetLabel - A management interface for maintaining network packet label * mapping tables for explicit packet labling protocols. * * Network protocols such as CIPSO and RIPSO require a label translation layer * to convert the label on the packet into something meaningful on the host * machine. In the current Linux implementation these mapping tables live * inside the kernel; NetLabel provides a mechanism for user space applications * to manage these mapping tables. * * NetLabel makes use of the Generic NETLINK mechanism as a transport layer to * send messages between kernel and user space. The general format of a * NetLabel message is shown below: * * +-----------------+-------------------+--------- --- -- - * | struct nlmsghdr | struct genlmsghdr | payload * +-----------------+-------------------+--------- --- -- - * * The 'nlmsghdr' and 'genlmsghdr' structs should be dealt with like normal. * The payload is dependent on the subsystem specified in the * 'nlmsghdr->nlmsg_type' and should be defined below, supporting functions * should be defined in the corresponding net/netlabel/netlabel_<subsys>.h|c * file. All of the fields in the NetLabel payload are NETLINK attributes, see * the include/net/netlink.h file for more information on NETLINK attributes. * */ /* * NetLabel NETLINK protocol */ /* NetLabel NETLINK protocol version * 1: initial version * 2: added static labels for unlabeled connections * 3: network selectors added to the NetLabel/LSM domain mapping and the * CIPSO_V4_MAP_LOCAL CIPSO mapping was added */ #define NETLBL_PROTO_VERSION 3 /* NetLabel NETLINK types/families */ #define NETLBL_NLTYPE_NONE 0 #define NETLBL_NLTYPE_MGMT 1 #define NETLBL_NLTYPE_MGMT_NAME "NLBL_MGMT" #define NETLBL_NLTYPE_RIPSO 2 #define NETLBL_NLTYPE_RIPSO_NAME "NLBL_RIPSO" #define NETLBL_NLTYPE_CIPSOV4 3 #define NETLBL_NLTYPE_CIPSOV4_NAME "NLBL_CIPSOv4" #define NETLBL_NLTYPE_CIPSOV6 4 #define NETLBL_NLTYPE_CIPSOV6_NAME "NLBL_CIPSOv6" #define NETLBL_NLTYPE_UNLABELED 5 #define NETLBL_NLTYPE_UNLABELED_NAME "NLBL_UNLBL" #define NETLBL_NLTYPE_ADDRSELECT 6 #define NETLBL_NLTYPE_ADDRSELECT_NAME "NLBL_ADRSEL" #define NETLBL_NLTYPE_CALIPSO 7 #define NETLBL_NLTYPE_CALIPSO_NAME "NLBL_CALIPSO" /* * NetLabel - Kernel API for accessing the network packet label mappings. * * The following functions are provided for use by other kernel modules, * specifically kernel LSM modules, to provide a consistent, transparent API * for dealing with explicit packet labeling protocols such as CIPSO and * RIPSO. The functions defined here are implemented in the * net/netlabel/netlabel_kapi.c file. * */ /* NetLabel audit information */ struct netlbl_audit { u32 secid; kuid_t loginuid; unsigned int sessionid; }; /* * LSM security attributes */ /** * struct netlbl_lsm_cache - NetLabel LSM security attribute cache * @refcount: atomic reference counter * @free: LSM supplied function to free the cache data * @data: LSM supplied cache data * * Description: * This structure is provided for LSMs which wish to make use of the NetLabel * caching mechanism to store LSM specific data/attributes in the NetLabel * cache. If the LSM has to perform a lot of translation from the NetLabel * security attributes into it's own internal representation then the cache * mechanism can provide a way to eliminate some or all of that translation * overhead on a cache hit. * */ struct netlbl_lsm_cache { refcount_t refcount; void (*free) (const void *data); void *data; }; /** * struct netlbl_lsm_catmap - NetLabel LSM secattr category bitmap * @startbit: the value of the lowest order bit in the bitmap * @bitmap: the category bitmap * @next: pointer to the next bitmap "node" or NULL * * Description: * This structure is used to represent category bitmaps. Due to the large * number of categories supported by most labeling protocols it is not * practical to transfer a full bitmap internally so NetLabel adopts a sparse * bitmap structure modeled after SELinux's ebitmap structure. * The catmap bitmap field MUST be a power of two in length and large * enough to hold at least 240 bits. Special care (i.e. check the code!) * should be used when changing these values as the LSM implementation * probably has functions which rely on the sizes of these types to speed * processing. * */ #define NETLBL_CATMAP_MAPTYPE u64 #define NETLBL_CATMAP_MAPCNT 4 #define NETLBL_CATMAP_MAPSIZE (sizeof(NETLBL_CATMAP_MAPTYPE) * 8) #define NETLBL_CATMAP_SIZE (NETLBL_CATMAP_MAPSIZE * \ NETLBL_CATMAP_MAPCNT) #define NETLBL_CATMAP_BIT (NETLBL_CATMAP_MAPTYPE)0x01 struct netlbl_lsm_catmap { u32 startbit; NETLBL_CATMAP_MAPTYPE bitmap[NETLBL_CATMAP_MAPCNT]; struct netlbl_lsm_catmap *next; }; /** * struct netlbl_lsm_secattr - NetLabel LSM security attributes * @flags: indicate structure attributes, see NETLBL_SECATTR_* * @type: indicate the NLTYPE of the attributes * @domain: the NetLabel LSM domain * @cache: NetLabel LSM specific cache * @attr.mls: MLS sensitivity label * @attr.mls.cat: MLS category bitmap * @attr.mls.lvl: MLS sensitivity level * @attr.secid: LSM specific secid token * * Description: * This structure is used to pass security attributes between NetLabel and the * LSM modules. The flags field is used to specify which fields within the * struct are valid and valid values can be created by bitwise OR'ing the * NETLBL_SECATTR_* defines. The domain field is typically set by the LSM to * specify domain specific configuration settings and is not usually used by * NetLabel itself when returning security attributes to the LSM. * */ struct netlbl_lsm_secattr { u32 flags; /* bitmap values for 'flags' */ #define NETLBL_SECATTR_NONE 0x00000000 #define NETLBL_SECATTR_DOMAIN 0x00000001 #define NETLBL_SECATTR_DOMAIN_CPY (NETLBL_SECATTR_DOMAIN | \ NETLBL_SECATTR_FREE_DOMAIN) #define NETLBL_SECATTR_CACHE 0x00000002 #define NETLBL_SECATTR_MLS_LVL 0x00000004 #define NETLBL_SECATTR_MLS_CAT 0x00000008 #define NETLBL_SECATTR_SECID 0x00000010 /* bitmap meta-values for 'flags' */ #define NETLBL_SECATTR_FREE_DOMAIN 0x01000000 #define NETLBL_SECATTR_CACHEABLE (NETLBL_SECATTR_MLS_LVL | \ NETLBL_SECATTR_MLS_CAT | \ NETLBL_SECATTR_SECID) u32 type; char *domain; struct netlbl_lsm_cache *cache; struct { struct { struct netlbl_lsm_catmap *cat; u32 lvl; } mls; u32 secid; } attr; }; /** * struct netlbl_calipso_ops - NetLabel CALIPSO operations * @doi_add: add a CALIPSO DOI * @doi_free: free a CALIPSO DOI * @doi_getdef: returns a reference to a DOI * @doi_putdef: releases a reference of a DOI * @doi_walk: enumerate the DOI list * @sock_getattr: retrieve the socket's attr * @sock_setattr: set the socket's attr * @sock_delattr: remove the socket's attr * @req_setattr: set the req socket's attr * @req_delattr: remove the req socket's attr * @opt_getattr: retrieve attr from memory block * @skbuff_optptr: find option in packet * @skbuff_setattr: set the skbuff's attr * @skbuff_delattr: remove the skbuff's attr * @cache_invalidate: invalidate cache * @cache_add: add cache entry * * Description: * This structure is filled out by the CALIPSO engine and passed * to the NetLabel core via a call to netlbl_calipso_ops_register(). * It enables the CALIPSO engine (and hence IPv6) to be compiled * as a module. */ struct netlbl_calipso_ops { int (*doi_add)(struct calipso_doi *doi_def, struct netlbl_audit *audit_info); void (*doi_free)(struct calipso_doi *doi_def); int (*doi_remove)(u32 doi, struct netlbl_audit *audit_info); struct calipso_doi *(*doi_getdef)(u32 doi); void (*doi_putdef)(struct calipso_doi *doi_def); int (*doi_walk)(u32 *skip_cnt, int (*callback)(struct calipso_doi *doi_def, void *arg), void *cb_arg); int (*sock_getattr)(struct sock *sk, struct netlbl_lsm_secattr *secattr); int (*sock_setattr)(struct sock *sk, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); void (*sock_delattr)(struct sock *sk); int (*req_setattr)(struct request_sock *req, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); void (*req_delattr)(struct request_sock *req); int (*opt_getattr)(const unsigned char *calipso, struct netlbl_lsm_secattr *secattr); unsigned char *(*skbuff_optptr)(const struct sk_buff *skb); int (*skbuff_setattr)(struct sk_buff *skb, const struct calipso_doi *doi_def, const struct netlbl_lsm_secattr *secattr); int (*skbuff_delattr)(struct sk_buff *skb); void (*cache_invalidate)(void); int (*cache_add)(const unsigned char *calipso_ptr, const struct netlbl_lsm_secattr *secattr); }; /* * LSM security attribute operations (inline) */ /** * netlbl_secattr_cache_alloc - Allocate and initialize a secattr cache * @flags: the memory allocation flags * * Description: * Allocate and initialize a netlbl_lsm_cache structure. Returns a pointer * on success, NULL on failure. * */ static inline struct netlbl_lsm_cache *netlbl_secattr_cache_alloc(gfp_t flags) { struct netlbl_lsm_cache *cache; cache = kzalloc(sizeof(*cache), flags); if (cache) refcount_set(&cache->refcount, 1); return cache; } /** * netlbl_secattr_cache_free - Frees a netlbl_lsm_cache struct * @cache: the struct to free * * Description: * Frees @secattr including all of the internal buffers. * */ static inline void netlbl_secattr_cache_free(struct netlbl_lsm_cache *cache) { if (!refcount_dec_and_test(&cache->refcount)) return; if (cache->free) cache->free(cache->data); kfree(cache); } /** * netlbl_catmap_alloc - Allocate a LSM secattr catmap * @flags: memory allocation flags * * Description: * Allocate memory for a LSM secattr catmap, returns a pointer on success, NULL * on failure. * */ static inline struct netlbl_lsm_catmap *netlbl_catmap_alloc(gfp_t flags) { return kzalloc(sizeof(struct netlbl_lsm_catmap), flags); } /** * netlbl_catmap_free - Free a LSM secattr catmap * @catmap: the category bitmap * * Description: * Free a LSM secattr catmap. * */ static inline void netlbl_catmap_free(struct netlbl_lsm_catmap *catmap) { struct netlbl_lsm_catmap *iter; while (catmap) { iter = catmap; catmap = catmap->next; kfree(iter); } } /** * netlbl_secattr_init - Initialize a netlbl_lsm_secattr struct * @secattr: the struct to initialize * * Description: * Initialize an already allocated netlbl_lsm_secattr struct. * */ static inline void netlbl_secattr_init(struct netlbl_lsm_secattr *secattr) { memset(secattr, 0, sizeof(*secattr)); } /** * netlbl_secattr_destroy - Clears a netlbl_lsm_secattr struct * @secattr: the struct to clear * * Description: * Destroys the @secattr struct, including freeing all of the internal buffers. * The struct must be reset with a call to netlbl_secattr_init() before reuse. * */ static inline void netlbl_secattr_destroy(struct netlbl_lsm_secattr *secattr) { if (secattr->flags & NETLBL_SECATTR_FREE_DOMAIN) kfree(secattr->domain); if (secattr->flags & NETLBL_SECATTR_CACHE) netlbl_secattr_cache_free(secattr->cache); if (secattr->flags & NETLBL_SECATTR_MLS_CAT) netlbl_catmap_free(secattr->attr.mls.cat); } /** * netlbl_secattr_alloc - Allocate and initialize a netlbl_lsm_secattr struct * @flags: the memory allocation flags * * Description: * Allocate and initialize a netlbl_lsm_secattr struct. Returns a valid * pointer on success, or NULL on failure. * */ static inline struct netlbl_lsm_secattr *netlbl_secattr_alloc(gfp_t flags) { return kzalloc(sizeof(struct netlbl_lsm_secattr), flags); } /** * netlbl_secattr_free - Frees a netlbl_lsm_secattr struct * @secattr: the struct to free * * Description: * Frees @secattr including all of the internal buffers. * */ static inline void netlbl_secattr_free(struct netlbl_lsm_secattr *secattr) { netlbl_secattr_destroy(secattr); kfree(secattr); } #ifdef CONFIG_NETLABEL /* * LSM configuration operations */ int netlbl_cfg_map_del(const char *domain, u16 family, const void *addr, const void *mask, struct netlbl_audit *audit_info); int netlbl_cfg_unlbl_map_add(const char *domain, u16 family, const void *addr, const void *mask, struct netlbl_audit *audit_info); int netlbl_cfg_unlbl_static_add(struct net *net, const char *dev_name, const void *addr, const void *mask, u16 family, u32 secid, struct netlbl_audit *audit_info); int netlbl_cfg_unlbl_static_del(struct net *net, const char *dev_name, const void *addr, const void *mask, u16 family, struct netlbl_audit *audit_info); int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info); void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info); int netlbl_cfg_cipsov4_map_add(u32 doi, const char *domain, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info); int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info); void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info); int netlbl_cfg_calipso_map_add(u32 doi, const char *domain, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info); /* * LSM security attribute operations */ int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset); int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset); int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, u32 *offset, unsigned long *bitmap); int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap, u32 bit, gfp_t flags); int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap, u32 start, u32 end, gfp_t flags); int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, u32 offset, unsigned long bitmap, gfp_t flags); /* Bitmap functions */ int netlbl_bitmap_walk(const unsigned char *bitmap, u32 bitmap_len, u32 offset, u8 state); void netlbl_bitmap_setbit(unsigned char *bitmap, u32 bit, u8 state); /* * LSM protocol operations (NetLabel LSM/kernel API) */ int netlbl_enabled(void); int netlbl_sock_setattr(struct sock *sk, u16 family, const struct netlbl_lsm_secattr *secattr); void netlbl_sock_delattr(struct sock *sk); int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr); int netlbl_conn_setattr(struct sock *sk, struct sockaddr *addr, const struct netlbl_lsm_secattr *secattr); int netlbl_req_setattr(struct request_sock *req, const struct netlbl_lsm_secattr *secattr); void netlbl_req_delattr(struct request_sock *req); int netlbl_skbuff_setattr(struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr); int netlbl_skbuff_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr); void netlbl_skbuff_err(struct sk_buff *skb, u16 family, int error, int gateway); /* * LSM label mapping cache operations */ void netlbl_cache_invalidate(void); int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr); /* * Protocol engine operations */ struct audit_buffer *netlbl_audit_start(int type, struct netlbl_audit *audit_info); #else static inline int netlbl_cfg_map_del(const char *domain, u16 family, const void *addr, const void *mask, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline int netlbl_cfg_unlbl_map_add(const char *domain, u16 family, void *addr, void *mask, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline int netlbl_cfg_unlbl_static_add(struct net *net, const char *dev_name, const void *addr, const void *mask, u16 family, u32 secid, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline int netlbl_cfg_unlbl_static_del(struct net *net, const char *dev_name, const void *addr, const void *mask, u16 family, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline int netlbl_cfg_cipsov4_add(struct cipso_v4_doi *doi_def, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline void netlbl_cfg_cipsov4_del(u32 doi, struct netlbl_audit *audit_info) { return; } static inline int netlbl_cfg_cipsov4_map_add(u32 doi, const char *domain, const struct in_addr *addr, const struct in_addr *mask, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline int netlbl_cfg_calipso_add(struct calipso_doi *doi_def, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline void netlbl_cfg_calipso_del(u32 doi, struct netlbl_audit *audit_info) { return; } static inline int netlbl_cfg_calipso_map_add(u32 doi, const char *domain, const struct in6_addr *addr, const struct in6_addr *mask, struct netlbl_audit *audit_info) { return -ENOSYS; } static inline int netlbl_catmap_walk(struct netlbl_lsm_catmap *catmap, u32 offset) { return -ENOENT; } static inline int netlbl_catmap_walkrng(struct netlbl_lsm_catmap *catmap, u32 offset) { return -ENOENT; } static inline int netlbl_catmap_getlong(struct netlbl_lsm_catmap *catmap, u32 *offset, unsigned long *bitmap) { return 0; } static inline int netlbl_catmap_setbit(struct netlbl_lsm_catmap **catmap, u32 bit, gfp_t flags) { return 0; } static inline int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap, u32 start, u32 end, gfp_t flags) { return 0; } static inline int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap, u32 offset, unsigned long bitmap, gfp_t flags) { return 0; } static inline int netlbl_enabled(void) { return 0; } static inline int netlbl_sock_setattr(struct sock *sk, u16 family, const struct netlbl_lsm_secattr *secattr) { return -ENOSYS; } static inline void netlbl_sock_delattr(struct sock *sk) { } static inline int netlbl_sock_getattr(struct sock *sk, struct netlbl_lsm_secattr *secattr) { return -ENOSYS; } static inline int netlbl_conn_setattr(struct sock *sk, struct sockaddr *addr, const struct netlbl_lsm_secattr *secattr) { return -ENOSYS; } static inline int netlbl_req_setattr(struct request_sock *req, const struct netlbl_lsm_secattr *secattr) { return -ENOSYS; } static inline void netlbl_req_delattr(struct request_sock *req) { return; } static inline int netlbl_skbuff_setattr(struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { return -ENOSYS; } static inline int netlbl_skbuff_getattr(const struct sk_buff *skb, u16 family, struct netlbl_lsm_secattr *secattr) { return -ENOSYS; } static inline void netlbl_skbuff_err(struct sk_buff *skb, int error, int gateway) { return; } static inline void netlbl_cache_invalidate(void) { return; } static inline int netlbl_cache_add(const struct sk_buff *skb, u16 family, const struct netlbl_lsm_secattr *secattr) { return 0; } static inline struct audit_buffer *netlbl_audit_start(int type, struct netlbl_audit *audit_info) { return NULL; } #endif /* CONFIG_NETLABEL */ const struct netlbl_calipso_ops * netlbl_calipso_ops_register(const struct netlbl_calipso_ops *ops); #endif /* _NETLABEL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_VMACACHE_H #define __LINUX_VMACACHE_H #include <linux/sched.h> #include <linux/mm.h> static inline void vmacache_flush(struct task_struct *tsk) { memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); } extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr); #ifndef CONFIG_MMU extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, unsigned long start, unsigned long end); #endif static inline void vmacache_invalidate(struct mm_struct *mm) { mm->vmacache_seqnum++; } #endif /* __LINUX_VMACACHE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 /* SPDX-License-Identifier: GPL-2.0 */ /* * RT Mutexes: blocking mutual exclusion locks with PI support * * started by Ingo Molnar and Thomas Gleixner: * * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * This file contains the private data structure and API definitions. */ #ifndef __KERNEL_RTMUTEX_COMMON_H #define __KERNEL_RTMUTEX_COMMON_H #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * * @tree_entry: pi node to enqueue into the mutex waiters tree * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { struct rb_node tree_entry; struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif int prio; u64 deadline; }; /* * Various helpers to access the waiters-tree: */ #ifdef CONFIG_RT_MUTEXES static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; if (leftmost) { w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); } return w; } static inline int task_has_pi_waiters(struct task_struct *p) { return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, pi_tree_entry); } #else static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return false; } static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { return NULL; } static inline int task_has_pi_waiters(struct task_struct *p) { return false; } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { return NULL; } #endif /* * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* * Constants for rt mutex functions which have a selectable deadlock * detection. * * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are * no further PI adjustments to be made. * * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full * walk of the lock chain. */ enum rtmutex_chainwalk { RT_MUTEX_MIN_CHAINWALK, RT_MUTEX_FULL_CHAINWALK, }; /* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wqh); extern void rt_mutex_postunlock(struct wake_q_head *wake_q); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" #else # include "rtmutex.h" #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1994 Linus Torvalds * * Pentium III FXSR, SSE support * General FPU state handling cleanups * Gareth Hughes <gareth@valinux.com>, May 2000 */ #include <asm/fpu/internal.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> #include <asm/traps.h> #include <asm/irq_regs.h> #include <linux/hardirq.h> #include <linux/pkeys.h> #define CREATE_TRACE_POINTS #include <asm/trace/fpu.h> /* * Represents the initial FPU state. It's mostly (but not completely) zeroes, * depending on the FPU hardware format: */ union fpregs_state init_fpstate __read_mostly; /* * Track whether the kernel is using the FPU state * currently. * * This flag is used: * * - by IRQ context code to potentially use the FPU * if it's unused. * * - to debug kernel_fpu_begin()/end() correctness */ static DEFINE_PER_CPU(bool, in_kernel_fpu); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); static bool kernel_fpu_disabled(void) { return this_cpu_read(in_kernel_fpu); } static bool interrupted_kernel_fpu_idle(void) { return !kernel_fpu_disabled(); } /* * Were we in user mode (or vm86 mode) when we were * interrupted? * * Doing kernel_fpu_begin/end() is ok if we are running * in an interrupt context from user mode - we'll just * save the FPU state as required. */ static bool interrupted_user_mode(void) { struct pt_regs *regs = get_irq_regs(); return regs && user_mode(regs); } /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? * * It's always ok in process context (ie "not interrupt") * but it is sometimes ok even from an irq. */ bool irq_fpu_usable(void) { return !in_interrupt() || interrupted_user_mode() || interrupted_kernel_fpu_idle(); } EXPORT_SYMBOL(irq_fpu_usable); /* * These must be called with preempt disabled. Returns * 'true' if the FPU state is still intact and we can * keep registers active. * * The legacy FNSAVE instruction cleared all FPU state * unconditionally, so registers are essentially destroyed. * Modern FPU state can be kept in registers, if there are * no pending FP exceptions. */ int copy_fpregs_to_fpstate(struct fpu *fpu) { if (likely(use_xsave())) { copy_xregs_to_kernel(&fpu->state.xsave); /* * AVX512 state is tracked here because its use is * known to slow the max clock speed of the core. */ if (fpu->state.xsave.header.xfeatures & XFEATURE_MASK_AVX512) fpu->avx512_timestamp = jiffies; return 1; } if (likely(use_fxsr())) { copy_fxregs_to_kernel(fpu); return 1; } /* * Legacy FPU register saving, FNSAVE always clears FPU registers, * so we have to mark them inactive: */ asm volatile("fnsave %[fp]; fwait" : [fp] "=m" (fpu->state.fsave)); return 0; } EXPORT_SYMBOL(copy_fpregs_to_fpstate); void kernel_fpu_begin_mask(unsigned int kfpu_mask) { preempt_disable(); WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); if (!(current->flags & PF_KTHREAD) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); /* * Ignore return value -- we don't care if reg state * is clobbered. */ copy_fpregs_to_fpstate(&current->thread.fpu); } __cpu_invalidate_fpregs_state(); /* Put sane initial values into the control registers. */ if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM)) ldmxcsr(MXCSR_DEFAULT); if (unlikely(kfpu_mask & KFPU_387) && boot_cpu_has(X86_FEATURE_FPU)) asm volatile ("fninit"); } EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, false); preempt_enable(); } EXPORT_SYMBOL_GPL(kernel_fpu_end); /* * Save the FPU state (mark it for reload if necessary): * * This only ever gets called for the current task. */ void fpu__save(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); fpregs_lock(); trace_x86_fpu_before_save(fpu); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) { if (!copy_fpregs_to_fpstate(fpu)) { copy_kernel_to_fpregs(&fpu->state); } } trace_x86_fpu_after_save(fpu); fpregs_unlock(); } /* * Legacy x87 fpstate state init: */ static inline void fpstate_init_fstate(struct fregs_state *fp) { fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; fp->twd = 0xffffffffu; fp->fos = 0xffff0000u; } void fpstate_init(union fpregs_state *state) { if (!static_cpu_has(X86_FEATURE_FPU)) { fpstate_init_soft(&state->soft); return; } memset(state, 0, fpu_kernel_xstate_size); if (static_cpu_has(X86_FEATURE_XSAVES)) fpstate_init_xstate(&state->xsave); if (static_cpu_has(X86_FEATURE_FXSR)) fpstate_init_fxstate(&state->fxsave); else fpstate_init_fstate(&state->fsave); } EXPORT_SYMBOL_GPL(fpstate_init); int fpu__copy(struct task_struct *dst, struct task_struct *src) { struct fpu *dst_fpu = &dst->thread.fpu; struct fpu *src_fpu = &src->thread.fpu; dst_fpu->last_cpu = -1; if (!static_cpu_has(X86_FEATURE_FPU)) return 0; WARN_ON_FPU(src_fpu != &current->thread.fpu); /* * Don't let 'init optimized' areas of the XSAVE area * leak into the child task: */ memset(&dst_fpu->state.xsave, 0, fpu_kernel_xstate_size); /* * If the FPU registers are not current just memcpy() the state. * Otherwise save current FPU registers directly into the child's FPU * context, without any memory-to-memory copying. * * ( The function 'fails' in the FNSAVE case, which destroys * register contents so we have to load them back. ) */ fpregs_lock(); if (test_thread_flag(TIF_NEED_FPU_LOAD)) memcpy(&dst_fpu->state, &src_fpu->state, fpu_kernel_xstate_size); else if (!copy_fpregs_to_fpstate(dst_fpu)) copy_kernel_to_fpregs(&dst_fpu->state); fpregs_unlock(); set_tsk_thread_flag(dst, TIF_NEED_FPU_LOAD); trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* * Activate the current task's in-memory FPU context, * if it has not been used before: */ static void fpu__initialize(struct fpu *fpu) { WARN_ON_FPU(fpu != &current->thread.fpu); set_thread_flag(TIF_NEED_FPU_LOAD); fpstate_init(&fpu->state); trace_x86_fpu_init_state(fpu); } /* * This function must be called before we read a task's fpstate. * * There's two cases where this gets called: * * - for the current task (when coredumping), in which case we have * to save the latest FPU registers into the fpstate, * * - or it's called for stopped tasks (ptrace), in which case the * registers were already saved by the context-switch code when * the task scheduled out. * * If the task has used the FPU before then save it. */ void fpu__prepare_read(struct fpu *fpu) { if (fpu == &current->thread.fpu) fpu__save(fpu); } /* * This function must be called before we write a task's fpstate. * * Invalidate any cached FPU registers. * * After this function call, after registers in the fpstate are * modified and the child task has woken up, the child task will * restore the modified FPU state from the modified context. If we * didn't clear its cached status here then the cached in-registers * state pending on its former CPU could be restored, corrupting * the modifications. */ void fpu__prepare_write(struct fpu *fpu) { /* * Only stopped child tasks can be used to modify the FPU * state in the fpstate buffer: */ WARN_ON_FPU(fpu == &current->thread.fpu); /* Invalidate any cached state: */ __fpu_invalidate_fpregs_state(fpu); } /* * Drops current FPU state: deactivates the fpregs and * the fpstate. NOTE: it still leaves previous contents * in the fpregs in the eager-FPU case. * * This function can be used in cases where we know that * a state-restore is coming: either an explicit one, * or a reschedule. */ void fpu__drop(struct fpu *fpu) { preempt_disable(); if (fpu == &current->thread.fpu) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" _ASM_EXTABLE(1b, 2b)); fpregs_deactivate(fpu); } trace_x86_fpu_dropped(fpu); preempt_enable(); } /* * Clear FPU registers by setting them up from the init fpstate. * Caller must do fpregs_[un]lock() around it. */ static inline void copy_init_fpstate_to_fpregs(u64 features_mask) { if (use_xsave()) copy_kernel_to_xregs(&init_fpstate.xsave, features_mask); else if (static_cpu_has(X86_FEATURE_FXSR)) copy_kernel_to_fxregs(&init_fpstate.fxsave); else copy_kernel_to_fregs(&init_fpstate.fsave); if (boot_cpu_has(X86_FEATURE_OSPKE)) copy_init_pkru_to_fpregs(); } /* * Clear the FPU state back to init state. * * Called by sys_execve(), by the signal handler code and by various * error paths. */ static void fpu__clear(struct fpu *fpu, bool user_only) { WARN_ON_FPU(fpu != &current->thread.fpu); if (!static_cpu_has(X86_FEATURE_FPU)) { fpu__drop(fpu); fpu__initialize(fpu); return; } fpregs_lock(); if (user_only) { if (!fpregs_state_valid(fpu, smp_processor_id()) && xfeatures_mask_supervisor()) copy_kernel_to_xregs(&fpu->state.xsave, xfeatures_mask_supervisor()); copy_init_fpstate_to_fpregs(xfeatures_mask_user()); } else { copy_init_fpstate_to_fpregs(xfeatures_mask_all); } fpregs_mark_activate(); fpregs_unlock(); } void fpu__clear_user_states(struct fpu *fpu) { fpu__clear(fpu, true); } void fpu__clear_all(struct fpu *fpu) { fpu__clear(fpu, false); } /* * Load FPU context before returning to userspace. */ void switch_fpu_return(void) { if (!static_cpu_has(X86_FEATURE_FPU)) return; __fpregs_load_activate(); } EXPORT_SYMBOL_GPL(switch_fpu_return); #ifdef CONFIG_X86_DEBUG_FPU /* * If current FPU state according to its tracking (loaded FPU context on this * CPU) is not valid then we must have TIF_NEED_FPU_LOAD set so the context is * loaded on return to userland. */ void fpregs_assert_state_consistent(void) { struct fpu *fpu = &current->thread.fpu; if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; WARN_ON_FPU(!fpregs_state_valid(fpu, smp_processor_id())); } EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); #endif void fpregs_mark_activate(void) { struct fpu *fpu = &current->thread.fpu; fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); clear_thread_flag(TIF_NEED_FPU_LOAD); } EXPORT_SYMBOL_GPL(fpregs_mark_activate); /* * x87 math exception handling: */ int fpu__exception_code(struct fpu *fpu, int trap_nr) { int err; if (trap_nr == X86_TRAP_MF) { unsigned short cwd, swd; /* * (~cwd & swd) will mask out exceptions that are not set to unmasked * status. 0x3f is the exception bits in these regs, 0x200 is the * C1 reg you need in case of a stack fault, 0x040 is the stack * fault bit. We should only be taking one exception at a time, * so if this combination doesn't produce any single exception, * then we have a bad program that isn't synchronizing its FPU usage * and it will suffer the consequences since we won't be able to * fully reproduce the context of the exception. */ if (boot_cpu_has(X86_FEATURE_FXSR)) { cwd = fpu->state.fxsave.cwd; swd = fpu->state.fxsave.swd; } else { cwd = (unsigned short)fpu->state.fsave.cwd; swd = (unsigned short)fpu->state.fsave.swd; } err = swd & ~cwd; } else { /* * The SIMD FPU exceptions are handled a little differently, as there * is only a single status/control register. Thus, to determine which * unmasked exception was caught we must mask the exception mask bits * at 0x1f80, and then use these to mask the exception bits at 0x3f. */ unsigned short mxcsr = MXCSR_DEFAULT; if (boot_cpu_has(X86_FEATURE_XMM)) mxcsr = fpu->state.fxsave.mxcsr; err = ~(mxcsr >> 7) & mxcsr; } if (err & 0x001) { /* Invalid op */ /* * swd & 0x240 == 0x040: Stack Underflow * swd & 0x240 == 0x240: Stack Overflow * User must clear the SF bit (0x40) if set */ return FPE_FLTINV; } else if (err & 0x004) { /* Divide by Zero */ return FPE_FLTDIV; } else if (err & 0x008) { /* Overflow */ return FPE_FLTOVF; } else if (err & 0x012) { /* Denormal, Underflow */ return FPE_FLTUND; } else if (err & 0x020) { /* Precision */ return FPE_FLTRES; } /* * If we're using IRQ 13, or supposedly even some trap * X86_TRAP_MF implementations, it's possible * we get a spurious trap, which is not an error. */ return 0; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_TIMERQUEUE_H #define _LINUX_TIMERQUEUE_H #include <linux/rbtree.h> #include <linux/ktime.h> struct timerqueue_node { struct rb_node node; ktime_t expires; }; struct timerqueue_head { struct rb_root_cached rb_root; }; extern bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); extern bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); extern struct timerqueue_node *timerqueue_iterate_next( struct timerqueue_node *node); /** * timerqueue_getnext - Returns the timer with the earliest expiration time * * @head: head of timerqueue * * Returns a pointer to the timer node that has the earliest expiration time. */ static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { struct rb_node *leftmost = rb_first_cached(&head->rb_root); return rb_entry(leftmost, struct timerqueue_node, node); } static inline void timerqueue_init(struct timerqueue_node *node) { RB_CLEAR_NODE(&node->node); } static inline bool timerqueue_node_queued(struct timerqueue_node *node) { return !RB_EMPTY_NODE(&node->node); } static inline bool timerqueue_node_expires(struct timerqueue_node *node) { return node->expires; } static inline void timerqueue_init_head(struct timerqueue_head *head) { head->rb_root = RB_ROOT_CACHED; } #endif /* _LINUX_TIMERQUEUE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * PTP 1588 clock support - private declarations for the core module. * * Copyright (C) 2010 OMICRON electronics GmbH */ #ifndef _PTP_PRIVATE_H_ #define _PTP_PRIVATE_H_ #include <linux/cdev.h> #include <linux/device.h> #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/posix-clock.h> #include <linux/ptp_clock.h> #include <linux/ptp_clock_kernel.h> #include <linux/time.h> #define PTP_MAX_TIMESTAMPS 128 #define PTP_BUF_TIMESTAMPS 30 struct timestamp_event_queue { struct ptp_extts_event buf[PTP_MAX_TIMESTAMPS]; int head; int tail; spinlock_t lock; }; struct ptp_clock { struct posix_clock clock; struct device dev; struct ptp_clock_info *info; dev_t devid; int index; /* index into clocks.map */ struct pps_device *pps_source; long dialed_frequency; /* remembers the frequency adjustment */ struct timestamp_event_queue tsevq; /* simple fifo for time stamps */ struct mutex tsevq_mux; /* one process at a time reading the fifo */ struct mutex pincfg_mux; /* protect concurrent info->pin_config access */ wait_queue_head_t tsev_wq; int defunct; /* tells readers to go away when clock is being removed */ struct device_attribute *pin_dev_attr; struct attribute **pin_attr; struct attribute_group pin_attr_group; /* 1st entry is a pointer to the real group, 2nd is NULL terminator */ const struct attribute_group *pin_attr_groups[2]; struct kthread_worker *kworker; struct kthread_delayed_work aux_work; }; /* * The function queue_cnt() is safe for readers to call without * holding q->lock. Readers use this function to verify that the queue * is nonempty before proceeding with a dequeue operation. The fact * that a writer might concurrently increment the tail does not * matter, since the queue remains nonempty nonetheless. */ static inline int queue_cnt(struct timestamp_event_queue *q) { int cnt = q->tail - q->head; return cnt < 0 ? PTP_MAX_TIMESTAMPS + cnt : cnt; } /* * see ptp_chardev.c */ /* caller must hold pincfg_mux */ int ptp_set_pinfunc(struct ptp_clock *ptp, unsigned int pin, enum ptp_pin_function func, unsigned int chan); long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg); int ptp_open(struct posix_clock *pc, fmode_t fmode); ssize_t ptp_read(struct posix_clock *pc, uint flags, char __user *buf, size_t cnt); __poll_t ptp_poll(struct posix_clock *pc, struct file *fp, poll_table *wait); /* * see ptp_sysfs.c */ extern const struct attribute_group *ptp_groups[]; int ptp_populate_pin_groups(struct ptp_clock *ptp); void ptp_cleanup_pin_groups(struct ptp_clock *ptp); #endif
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 // SPDX-License-Identifier: GPL-2.0-only /* * fs/dcache.c * * Complete reimplementation * (C) 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ /* * Notes on the allocation strategy: * * The dcache is a master of the icache - whenever a dcache entry * exists, the inode will always exist. "iput()" is done either when * the dcache entry is deleted or garbage collected. */ #include <linux/ratelimit.h> #include <linux/string.h> #include <linux/mm.h> #include <linux/fs.h> #include <linux/fscrypt.h> #include <linux/fsnotify.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> #include <linux/cache.h> #include <linux/export.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/memblock.h> #include <linux/bit_spinlock.h> #include <linux/rculist_bl.h> #include <linux/list_lru.h> #include "internal.h" #include "mount.h" /* * Usage: * dcache->d_inode->i_lock protects: * - i_dentry, d_u.d_alias, d_inode of aliases * dcache_hash_bucket lock protects: * - the dcache hash table * s_roots bl list spinlock protects: * - the s_roots list (see __d_drop) * dentry->d_sb->s_dentry_lru_lock protects: * - the dcache lru lists and counters * d_lock protects: * - d_flags * - d_name * - d_lru * - d_count * - d_unhashed() * - d_parent and d_subdirs * - childrens' d_child and d_parent * - d_u.d_alias, d_inode * * Ordering: * dentry->d_inode->i_lock * dentry->d_lock * dentry->d_sb->s_dentry_lru_lock * dcache_hash_bucket lock * s_roots lock * * If there is an ancestor relationship: * dentry->d_parent->...->d_parent->d_lock * ... * dentry->d_parent->d_lock * dentry->d_lock * * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); static struct kmem_cache *dentry_cache __read_mostly; const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); /* * This is the single most critical data structure when it comes * to the dcache: the hashtable for lookups. Somebody should try * to make this good - I've just made it work. * * This hash-function tries to avoid losing too many bits of hash * information, yet avoid using a prime hash-size or similar. */ static unsigned int d_hash_shift __read_mostly; static struct hlist_bl_head *dentry_hashtable __read_mostly; static inline struct hlist_bl_head *d_hash(unsigned int hash) { return dentry_hashtable + (hash >> d_hash_shift); } #define IN_LOOKUP_SHIFT 10 static struct hlist_bl_head in_lookup_hashtable[1 << IN_LOOKUP_SHIFT]; static inline struct hlist_bl_head *in_lookup_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; return in_lookup_hashtable + hash_32(hash, IN_LOOKUP_SHIFT); } /* Statistics gathering. */ struct dentry_stat_t dentry_stat = { .age_limit = 45, }; static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_negative); #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS) /* * Here we resort to our own counters instead of using generic per-cpu counters * for consistency with what the vfs inode code does. We are expected to harvest * better code and performance by having our own specialized counters. * * Please note that the loop is done over all possible CPUs, not over all online * CPUs. The reason for this is that we don't want to play games with CPUs going * on and off. If one of them goes off, we will just keep their counters. * * glommer: See cffbc8a for details, and if you ever intend to change this, * please update all vfs counters to match. */ static long get_nr_dentry(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_unused(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_unused, i); return sum < 0 ? 0 : sum; } static long get_nr_dentry_negative(void) { int i; long sum = 0; for_each_possible_cpu(i) sum += per_cpu(nr_dentry_negative, i); return sum < 0 ? 0 : sum; } int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { dentry_stat.nr_dentry = get_nr_dentry(); dentry_stat.nr_unused = get_nr_dentry_unused(); dentry_stat.nr_negative = get_nr_dentry_negative(); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } #endif /* * Compare 2 name strings, return 0 if they match, otherwise non-zero. * The strings are both count bytes long, and count is non-zero. */ #ifdef CONFIG_DCACHE_WORD_ACCESS #include <asm/word-at-a-time.h> /* * NOTE! 'cs' and 'scount' come from a dentry, so it has a * aligned allocation for this particular component. We don't * strictly need the load_unaligned_zeropad() safety, but it * doesn't hurt either. * * In contrast, 'ct' and 'tcount' can be from a pathname, and do * need the careful unaligned handling. */ static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { unsigned long a,b,mask; for (;;) { a = read_word_at_a_time(cs); b = load_unaligned_zeropad(ct); if (tcount < sizeof(unsigned long)) break; if (unlikely(a != b)) return 1; cs += sizeof(unsigned long); ct += sizeof(unsigned long); tcount -= sizeof(unsigned long); if (!tcount) return 0; } mask = bytemask_from_count(tcount); return unlikely(!!((a ^ b) & mask)); } #else static inline int dentry_string_cmp(const unsigned char *cs, const unsigned char *ct, unsigned tcount) { do { if (*cs != *ct) return 1; cs++; ct++; tcount--; } while (tcount); return 0; } #endif static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *ct, unsigned tcount) { /* * Be careful about RCU walk racing with rename: * use 'READ_ONCE' to fetch the name pointer. * * NOTE! Even if a rename will mean that the length * was not loaded atomically, we don't care. The * RCU walk will check the sequence count eventually, * and catch it. And we won't overrun the buffer, * because we're reading the name pointer atomically, * and a dentry name is guaranteed to be properly * terminated with a NUL byte. * * End result: even if 'len' is wrong, we'll exit * early because the data cannot match (there can * be no NUL in the ct/tcount data) */ const unsigned char *cs = READ_ONCE(dentry->d_name.name); return dentry_string_cmp(cs, ct, tcount); } struct external_name { union { atomic_t count; struct rcu_head head; } u; unsigned char name[]; }; static inline struct external_name *external_name(struct dentry *dentry) { return container_of(dentry->d_name.name, struct external_name, name[0]); } static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kmem_cache_free(dentry_cache, dentry); } static void __d_free_external(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); } static inline int dname_external(const struct dentry *dentry) { return dentry->d_name.name != dentry->d_iname; } void take_dentry_name_snapshot(struct name_snapshot *name, struct dentry *dentry) { spin_lock(&dentry->d_lock); name->name = dentry->d_name; if (unlikely(dname_external(dentry))) { atomic_inc(&external_name(dentry)->u.count); } else { memcpy(name->inline_name, dentry->d_iname, dentry->d_name.len + 1); name->name.name = name->inline_name; } spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(take_dentry_name_snapshot); void release_dentry_name_snapshot(struct name_snapshot *name) { if (unlikely(name->name.name != name->inline_name)) { struct external_name *p; p = container_of(name->name.name, struct external_name, name[0]); if (unlikely(atomic_dec_and_test(&p->u.count))) kfree_rcu(p, u.head); } } EXPORT_SYMBOL(release_dentry_name_snapshot); static inline void __d_set_inode_and_type(struct dentry *dentry, struct inode *inode, unsigned type_flags) { unsigned flags; dentry->d_inode = inode; flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); flags |= type_flags; smp_store_release(&dentry->d_flags, flags); } static inline void __d_clear_type_and_inode(struct dentry *dentry) { unsigned flags = READ_ONCE(dentry->d_flags); flags &= ~(DCACHE_ENTRY_TYPE | DCACHE_FALLTHRU); WRITE_ONCE(dentry->d_flags, flags); dentry->d_inode = NULL; if (dentry->d_flags & DCACHE_LRU_LIST) this_cpu_inc(nr_dentry_negative); } static void dentry_free(struct dentry *dentry) { WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); if (unlikely(dname_external(dentry))) { struct external_name *p = external_name(dentry); if (likely(atomic_dec_and_test(&p->u.count))) { call_rcu(&dentry->d_u.d_rcu, __d_free_external); return; } } /* if dentry was never visible to RCU, immediate free is OK */ if (dentry->d_flags & DCACHE_NORCU) __d_free(&dentry->d_u.d_rcu); else call_rcu(&dentry->d_u.d_rcu, __d_free); } /* * Release the dentry's inode, using the filesystem * d_iput() operation if defined. */ static void dentry_unlink_inode(struct dentry * dentry) __releases(dentry->d_lock) __releases(dentry->d_inode->i_lock) { struct inode *inode = dentry->d_inode; raw_write_seqcount_begin(&dentry->d_seq); __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); raw_write_seqcount_end(&dentry->d_seq); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) fsnotify_inoderemove(inode); if (dentry->d_op && dentry->d_op->d_iput) dentry->d_op->d_iput(dentry, inode); else iput(inode); } /* * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry * is in use - which includes both the "real" per-superblock * LRU list _and_ the DCACHE_SHRINK_LIST use. * * The DCACHE_SHRINK_LIST bit is set whenever the dentry is * on the shrink list (ie not on the superblock LRU list). * * The per-cpu "nr_dentry_unused" counters are updated with * the DCACHE_LRU_LIST bit. * * The per-cpu "nr_dentry_negative" counters are only updated * when deleted from or added to the per-superblock LRU list, not * from/to the shrink list. That is to avoid an unneeded dec/inc * pair when moving from LRU to shrink list in select_collect(). * * These helper functions make sure we always follow the * rules. d_lock must be held by the caller. */ #define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x)) static void d_lru_add(struct dentry *dentry) { D_FLAG_VERIFY(dentry, 0); dentry->d_flags |= DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_inc(nr_dentry_negative); WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_lru_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru)); } static void d_shrink_del(struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); list_del_init(&dentry->d_lru); dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST); this_cpu_dec(nr_dentry_unused); } static void d_shrink_add(struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, 0); list_add(&dentry->d_lru, list); dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST; this_cpu_inc(nr_dentry_unused); } /* * These can only be called under the global LRU lock, ie during the * callback for freeing the LRU list. "isolate" removes it from the * LRU lists entirely, while shrink_move moves it to the indicated * private list. */ static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags &= ~DCACHE_LRU_LIST; this_cpu_dec(nr_dentry_unused); if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate(lru, &dentry->d_lru); } static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry, struct list_head *list) { D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST); dentry->d_flags |= DCACHE_SHRINK_LIST; if (d_is_negative(dentry)) this_cpu_dec(nr_dentry_negative); list_lru_isolate_move(lru, &dentry->d_lru, list); } /** * d_drop - drop a dentry * @dentry: dentry to drop * * d_drop() unhashes the entry from the parent dentry hashes, so that it won't * be found through a VFS lookup any more. Note that this is different from * deleting the dentry - d_delete will try to mark the dentry negative if * possible, giving a successful _negative_ lookup, while d_drop will * just make the cache lookup fail. * * d_drop() is used mainly for stuff that wants to invalidate a dentry for some * reason (NFS timeouts or autofs deletes). * * __d_drop requires dentry->d_lock * ___d_drop doesn't mark dentry as "unhashed" * (dentry->d_hash.pprev will be LIST_POISON2, not NULL). */ static void ___d_drop(struct dentry *dentry) { struct hlist_bl_head *b; /* * Hashed dentries are normally on the dentry hashtable, * with the exception of those newly allocated by * d_obtain_root, which are always IS_ROOT: */ if (unlikely(IS_ROOT(dentry))) b = &dentry->d_sb->s_roots; else b = d_hash(dentry->d_name.hash); hlist_bl_lock(b); __hlist_bl_del(&dentry->d_hash); hlist_bl_unlock(b); } void __d_drop(struct dentry *dentry) { if (!d_unhashed(dentry)) { ___d_drop(dentry); dentry->d_hash.pprev = NULL; write_seqcount_invalidate(&dentry->d_seq); } } EXPORT_SYMBOL(__d_drop); void d_drop(struct dentry *dentry) { spin_lock(&dentry->d_lock); __d_drop(dentry); spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_drop); static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) { struct dentry *next; /* * Inform d_walk() and shrink_dentry_list() that we are no longer * attached to the dentry tree */ dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(list_empty(&dentry->d_child))) return; __list_del_entry(&dentry->d_child); /* * Cursors can move around the list of children. While we'd been * a normal list member, it didn't matter - ->d_child.next would've * been updated. However, from now on it won't be and for the * things like d_walk() it might end up with a nasty surprise. * Normally d_walk() doesn't care about cursors moving around - * ->d_lock on parent prevents that and since a cursor has no children * of its own, we get through it without ever unlocking the parent. * There is one exception, though - if we ascend from a child that * gets killed as soon as we unlock it, the next sibling is found * using the value left in its ->d_child.next. And if _that_ * pointed to a cursor, and cursor got moved (e.g. by lseek()) * before d_walk() regains parent->d_lock, we'll end up skipping * everything the cursor had been moved past. * * Solution: make sure that the pointer left behind in ->d_child.next * points to something that won't be moving around. I.e. skip the * cursors. */ while (dentry->d_child.next != &parent->d_subdirs) { next = list_entry(dentry->d_child.next, struct dentry, d_child); if (likely(!(next->d_flags & DCACHE_DENTRY_CURSOR))) break; dentry->d_child.next = next->d_child.next; } } static void __dentry_kill(struct dentry *dentry) { struct dentry *parent = NULL; bool can_free = true; if (!IS_ROOT(dentry)) parent = dentry->d_parent; /* * The dentry is now unrecoverably dead to the world. */ lockref_mark_dead(&dentry->d_lockref); /* * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) d_lru_del(dentry); } /* if it was on the hash then remove it */ __d_drop(dentry); dentry_unlist(dentry, parent); if (parent) spin_unlock(&parent->d_lock); if (dentry->d_inode) dentry_unlink_inode(dentry); else spin_unlock(&dentry->d_lock); this_cpu_dec(nr_dentry); if (dentry->d_op && dentry->d_op->d_release) dentry->d_op->d_release(dentry); spin_lock(&dentry->d_lock); if (dentry->d_flags & DCACHE_SHRINK_LIST) { dentry->d_flags |= DCACHE_MAY_FREE; can_free = false; } spin_unlock(&dentry->d_lock); if (likely(can_free)) dentry_free(dentry); cond_resched(); } static struct dentry *__lock_parent(struct dentry *dentry) { struct dentry *parent; rcu_read_lock(); spin_unlock(&dentry->d_lock); again: parent = READ_ONCE(dentry->d_parent); spin_lock(&parent->d_lock); /* * We can't blindly lock dentry until we are sure * that we won't violate the locking order. * Any changes of dentry->d_parent must have * been done with parent->d_lock held, so * spin_lock() above is enough of a barrier * for checking if it's still our child. */ if (unlikely(parent != dentry->d_parent)) { spin_unlock(&parent->d_lock); goto again; } rcu_read_unlock(); if (parent != dentry) spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); else parent = NULL; return parent; } static inline struct dentry *lock_parent(struct dentry *dentry) { struct dentry *parent = dentry->d_parent; if (IS_ROOT(dentry)) return NULL; if (likely(spin_trylock(&parent->d_lock))) return parent; return __lock_parent(dentry); } static inline bool retain_dentry(struct dentry *dentry) { WARN_ON(d_in_lookup(dentry)); /* Unreachable? Get rid of it */ if (unlikely(d_unhashed(dentry))) return false; if (unlikely(dentry->d_flags & DCACHE_DISCONNECTED)) return false; if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) { if (dentry->d_op->d_delete(dentry)) return false; } if (unlikely(dentry->d_flags & DCACHE_DONTCACHE)) return false; /* retain; LRU fodder */ dentry->d_lockref.count--; if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) d_lru_add(dentry); else if (unlikely(!(dentry->d_flags & DCACHE_REFERENCED))) dentry->d_flags |= DCACHE_REFERENCED; return true; } void d_mark_dontcache(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); hlist_for_each_entry(de, &inode->i_dentry, d_u.d_alias) { spin_lock(&de->d_lock); de->d_flags |= DCACHE_DONTCACHE; spin_unlock(&de->d_lock); } inode->i_state |= I_DONTCACHE; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_mark_dontcache); /* * Finish off a dentry we've decided to kill. * dentry->d_lock must be held, returns with it unlocked. * Returns dentry requiring refcount drop, or NULL if we're done. */ static struct dentry *dentry_kill(struct dentry *dentry) __releases(dentry->d_lock) { struct inode *inode = dentry->d_inode; struct dentry *parent = NULL; if (inode && unlikely(!spin_trylock(&inode->i_lock))) goto slow_positive; if (!IS_ROOT(dentry)) { parent = dentry->d_parent; if (unlikely(!spin_trylock(&parent->d_lock))) { parent = __lock_parent(dentry); if (likely(inode || !dentry->d_inode)) goto got_locks; /* negative that became positive */ if (parent) spin_unlock(&parent->d_lock); inode = dentry->d_inode; goto slow_positive; } } __dentry_kill(dentry); return parent; slow_positive: spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); parent = lock_parent(dentry); got_locks: if (unlikely(dentry->d_lockref.count != 1)) { dentry->d_lockref.count--; } else if (likely(!retain_dentry(dentry))) { __dentry_kill(dentry); return parent; } /* we are keeping it, after all */ if (inode) spin_unlock(&inode->i_lock); if (parent) spin_unlock(&parent->d_lock); spin_unlock(&dentry->d_lock); return NULL; } /* * Try to do a lockless dput(), and return whether that was successful. * * If unsuccessful, we return false, having already taken the dentry lock. * * The caller needs to hold the RCU read lock, so that the dentry is * guaranteed to stay around even if the refcount goes down to zero! */ static inline bool fast_dput(struct dentry *dentry) { int ret; unsigned int d_flags; /* * If we have a d_op->d_delete() operation, we sould not * let the dentry count go to zero, so use "put_or_lock". */ if (unlikely(dentry->d_flags & DCACHE_OP_DELETE)) return lockref_put_or_lock(&dentry->d_lockref); /* * .. otherwise, we can try to just decrement the * lockref optimistically. */ ret = lockref_put_return(&dentry->d_lockref); /* * If the lockref_put_return() failed due to the lock being held * by somebody else, the fast path has failed. We will need to * get the lock, and then check the count again. */ if (unlikely(ret < 0)) { spin_lock(&dentry->d_lock); if (dentry->d_lockref.count > 1) { dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); return true; } return false; } /* * If we weren't the last ref, we're done. */ if (ret) return true; /* * Careful, careful. The reference count went down * to zero, but we don't hold the dentry lock, so * somebody else could get it again, and do another * dput(), and we need to not race with that. * * However, there is a very special and common case * where we don't care, because there is nothing to * do: the dentry is still hashed, it does not have * a 'delete' op, and it's referenced and already on * the LRU list. * * NOTE! Since we aren't locked, these values are * not "stable". However, it is sufficient that at * some point after we dropped the reference the * dentry was hashed and the flags had the proper * value. Other dentry users may have re-gotten * a reference to the dentry and change that, but * our work is done - we can leave the dentry * around with a zero refcount. */ smp_rmb(); d_flags = READ_ONCE(dentry->d_flags); d_flags &= DCACHE_REFERENCED | DCACHE_LRU_LIST | DCACHE_DISCONNECTED; /* Nothing to do? Dropping the reference was all we needed? */ if (d_flags == (DCACHE_REFERENCED | DCACHE_LRU_LIST) && !d_unhashed(dentry)) return true; /* * Not the fast normal case? Get the lock. We've already decremented * the refcount, but we'll need to re-check the situation after * getting the lock. */ spin_lock(&dentry->d_lock); /* * Did somebody else grab a reference to it in the meantime, and * we're no longer the last user after all? Alternatively, somebody * else could have killed it and marked it dead. Either way, we * don't need to do anything else. */ if (dentry->d_lockref.count) { spin_unlock(&dentry->d_lock); return true; } /* * Re-get the reference we optimistically dropped. We hold the * lock, and we just tested that it was zero, so we can just * set it to 1. */ dentry->d_lockref.count = 1; return false; } /* * This is dput * * This is complicated by the fact that we do not want to put * dentries that are no longer on any hash chain on the unused * list: we'd much rather just get rid of them immediately. * * However, that implies that we have to traverse the dentry * tree upwards to the parents which might _also_ now be * scheduled for deletion (it may have been only waiting for * its last child to go away). * * This tail recursion is done by hand as we don't want to depend * on the compiler to always get this right (gcc generally doesn't). * Real recursion would eat up our stack space. */ /* * dput - release a dentry * @dentry: dentry to release * * Release a dentry. This will drop the usage count and if appropriate * call the dentry unlink method as well as removing it from the queues and * releasing its resources. If the parent dentries were scheduled for release * they too may now get deleted. */ void dput(struct dentry *dentry) { while (dentry) { might_sleep(); rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } /* Slow case: now with the dentry lock held */ rcu_read_unlock(); if (likely(retain_dentry(dentry))) { spin_unlock(&dentry->d_lock); return; } dentry = dentry_kill(dentry); } } EXPORT_SYMBOL(dput); static void __dput_to_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { if (dentry->d_flags & DCACHE_SHRINK_LIST) { /* let the owner of the list it's on deal with it */ --dentry->d_lockref.count; } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!--dentry->d_lockref.count) d_shrink_add(dentry, list); } } void dput_to_list(struct dentry *dentry, struct list_head *list) { rcu_read_lock(); if (likely(fast_dput(dentry))) { rcu_read_unlock(); return; } rcu_read_unlock(); if (!retain_dentry(dentry)) __dput_to_list(dentry, list); spin_unlock(&dentry->d_lock); } /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) { dentry->d_lockref.count++; } static inline void __dget(struct dentry *dentry) { lockref_get(&dentry->d_lockref); } struct dentry *dget_parent(struct dentry *dentry) { int gotref; struct dentry *ret; unsigned seq; /* * Do optimistic parent lookup without any * locking. */ rcu_read_lock(); seq = raw_seqcount_begin(&dentry->d_seq); ret = READ_ONCE(dentry->d_parent); gotref = lockref_get_not_zero(&ret->d_lockref); rcu_read_unlock(); if (likely(gotref)) { if (!read_seqcount_retry(&dentry->d_seq, seq)) return ret; dput(ret); } repeat: /* * Don't need rcu_dereference because we re-check it was correct under * the lock. */ rcu_read_lock(); ret = dentry->d_parent; spin_lock(&ret->d_lock); if (unlikely(ret != dentry->d_parent)) { spin_unlock(&ret->d_lock); rcu_read_unlock(); goto repeat; } rcu_read_unlock(); BUG_ON(!ret->d_lockref.count); ret->d_lockref.count++; spin_unlock(&ret->d_lock); return ret; } EXPORT_SYMBOL(dget_parent); static struct dentry * __d_find_any_alias(struct inode *inode) { struct dentry *alias; if (hlist_empty(&inode->i_dentry)) return NULL; alias = hlist_entry(inode->i_dentry.first, struct dentry, d_u.d_alias); __dget(alias); return alias; } /** * d_find_any_alias - find any alias for a given inode * @inode: inode to find an alias for * * If any aliases exist for the given inode, take and return a * reference for one of them. If no aliases exist, return %NULL. */ struct dentry *d_find_any_alias(struct inode *inode) { struct dentry *de; spin_lock(&inode->i_lock); de = __d_find_any_alias(inode); spin_unlock(&inode->i_lock); return de; } EXPORT_SYMBOL(d_find_any_alias); /** * d_find_alias - grab a hashed alias of inode * @inode: inode in question * * If inode has a hashed alias, or is a directory and has any alias, * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root * of a filesystem, or if the directory was renamed and d_revalidate * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. */ static struct dentry *__d_find_alias(struct inode *inode) { struct dentry *alias; if (S_ISDIR(inode->i_mode)) return __d_find_any_alias(inode); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { __dget_dlock(alias); spin_unlock(&alias->d_lock); return alias; } spin_unlock(&alias->d_lock); } return NULL; } struct dentry *d_find_alias(struct inode *inode) { struct dentry *de = NULL; if (!hlist_empty(&inode->i_dentry)) { spin_lock(&inode->i_lock); de = __d_find_alias(inode); spin_unlock(&inode->i_lock); } return de; } EXPORT_SYMBOL(d_find_alias); /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. */ void d_prune_aliases(struct inode *inode) { struct dentry *dentry; restart: spin_lock(&inode->i_lock); hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) { struct dentry *parent = lock_parent(dentry); if (likely(!dentry->d_lockref.count)) { __dentry_kill(dentry); dput(parent); goto restart; } if (parent) spin_unlock(&parent->d_lock); } spin_unlock(&dentry->d_lock); } spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_prune_aliases); /* * Lock a dentry from shrink list. * Called under rcu_read_lock() and dentry->d_lock; the former * guarantees that nothing we access will be freed under us. * Note that dentry is *not* protected from concurrent dentry_kill(), * d_delete(), etc. * * Return false if dentry has been disrupted or grabbed, leaving * the caller to kick it off-list. Otherwise, return true and have * that dentry's inode and parent both locked. */ static bool shrink_lock_dentry(struct dentry *dentry) { struct inode *inode; struct dentry *parent; if (dentry->d_lockref.count) return false; inode = dentry->d_inode; if (inode && unlikely(!spin_trylock(&inode->i_lock))) { spin_unlock(&dentry->d_lock); spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); if (unlikely(dentry->d_lockref.count)) goto out; /* changed inode means that somebody had grabbed it */ if (unlikely(inode != dentry->d_inode)) goto out; } parent = dentry->d_parent; if (IS_ROOT(dentry) || likely(spin_trylock(&parent->d_lock))) return true; spin_unlock(&dentry->d_lock); spin_lock(&parent->d_lock); if (unlikely(parent != dentry->d_parent)) { spin_unlock(&parent->d_lock); spin_lock(&dentry->d_lock); goto out; } spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); if (likely(!dentry->d_lockref.count)) return true; spin_unlock(&parent->d_lock); out: if (inode) spin_unlock(&inode->i_lock); return false; } void shrink_dentry_list(struct list_head *list) { while (!list_empty(list)) { struct dentry *dentry, *parent; dentry = list_entry(list->prev, struct dentry, d_lru); spin_lock(&dentry->d_lock); rcu_read_lock(); if (!shrink_lock_dentry(dentry)) { bool can_free = false; rcu_read_unlock(); d_shrink_del(dentry); if (dentry->d_lockref.count < 0) can_free = dentry->d_flags & DCACHE_MAY_FREE; spin_unlock(&dentry->d_lock); if (can_free) dentry_free(dentry); continue; } rcu_read_unlock(); d_shrink_del(dentry); parent = dentry->d_parent; if (parent != dentry) __dput_to_list(parent, list); __dentry_kill(dentry); } } static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; /* * Referenced dentries are still in use. If they have active * counts, just remove them from the LRU. Otherwise give them * another pass through the LRU. */ if (dentry->d_lockref.count) { d_lru_isolate(lru, dentry); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } if (dentry->d_flags & DCACHE_REFERENCED) { dentry->d_flags &= ~DCACHE_REFERENCED; spin_unlock(&dentry->d_lock); /* * The list move itself will be made by the common LRU code. At * this point, we've dropped the dentry->d_lock but keep the * lru lock. This is safe to do, since every list movement is * protected by the lru lock even if both locks are held. * * This is guaranteed by the fact that all LRU management * functions are intermediated by the LRU API calls like * list_lru_add and list_lru_del. List movement in this file * only ever occur through this functions or through callbacks * like this one, that are called from the LRU API. * * The only exceptions to this are functions like * shrink_dentry_list, and code that first checks for the * DCACHE_SHRINK_LIST flag. Those are guaranteed to be * operating only with stack provided lists after they are * properly isolated from the main list. It is thus, always a * local access. */ return LRU_ROTATE; } d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * prune_dcache_sb - shrink the dcache * @sb: superblock * @sc: shrink control, passed to list_lru_shrink_walk() * * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This * is done when we need more memory and called from the superblock shrinker * function. * * This function may fail to free any resources if all the dentries are in * use. */ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) { LIST_HEAD(dispose); long freed; freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc, dentry_lru_isolate, &dispose); shrink_dentry_list(&dispose); return freed; } static enum lru_status dentry_lru_isolate_shrink(struct list_head *item, struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); /* * we are inverting the lru lock/dentry->d_lock here, * so use a trylock. If we fail to get the lock, just skip * it */ if (!spin_trylock(&dentry->d_lock)) return LRU_SKIP; d_lru_shrink_move(lru, dentry, freeable); spin_unlock(&dentry->d_lock); return LRU_REMOVED; } /** * shrink_dcache_sb - shrink dcache for a superblock * @sb: superblock * * Shrink the dcache for the specified super block. This is used to free * the dcache before unmounting a file system. */ void shrink_dcache_sb(struct super_block *sb) { do { LIST_HEAD(dispose); list_lru_walk(&sb->s_dentry_lru, dentry_lru_isolate_shrink, &dispose, 1024); shrink_dentry_list(&dispose); } while (list_lru_count(&sb->s_dentry_lru) > 0); } EXPORT_SYMBOL(shrink_dcache_sb); /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk * @D_WALK_QUIT: quit walk * @D_WALK_NORETRY: quit when retry is needed * @D_WALK_SKIP: skip this dentry and its children */ enum d_walk_ret { D_WALK_CONTINUE, D_WALK_QUIT, D_WALK_NORETRY, D_WALK_SKIP, }; /** * d_walk - walk the dentry tree * @parent: start of walk * @data: data passed to @enter() and @finish() * @enter: callback when first entering the dentry * * The @enter() callbacks are called with d_lock held. */ static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *)) { struct dentry *this_parent; struct list_head *next; unsigned seq = 0; enum d_walk_ret ret; bool retry = true; again: read_seqbegin_or_lock(&rename_lock, &seq); this_parent = parent; spin_lock(&this_parent->d_lock); ret = enter(data, this_parent); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: case D_WALK_SKIP: goto out_unlock; case D_WALK_NORETRY: retry = false; break; } repeat: next = this_parent->d_subdirs.next; resume: while (next != &this_parent->d_subdirs) { struct list_head *tmp = next; struct dentry *dentry = list_entry(tmp, struct dentry, d_child); next = tmp->next; if (unlikely(dentry->d_flags & DCACHE_DENTRY_CURSOR)) continue; spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); ret = enter(data, dentry); switch (ret) { case D_WALK_CONTINUE: break; case D_WALK_QUIT: spin_unlock(&dentry->d_lock); goto out_unlock; case D_WALK_NORETRY: retry = false; break; case D_WALK_SKIP: spin_unlock(&dentry->d_lock); continue; } if (!list_empty(&dentry->d_subdirs)) { spin_unlock(&this_parent->d_lock); spin_release(&dentry->d_lock.dep_map, _RET_IP_); this_parent = dentry; spin_acquire(&this_parent->d_lock.dep_map, 0, 1, _RET_IP_); goto repeat; } spin_unlock(&dentry->d_lock); } /* * All done at this level ... ascend and resume the search. */ rcu_read_lock(); ascend: if (this_parent != parent) { struct dentry *child = this_parent; this_parent = child->d_parent; spin_unlock(&child->d_lock); spin_lock(&this_parent->d_lock); /* might go back up the wrong parent if we have had a rename. */ if (need_seqretry(&rename_lock, seq)) goto rename_retry; /* go into the first sibling still alive */ do { next = child->d_child.next; if (next == &this_parent->d_subdirs) goto ascend; child = list_entry(next, struct dentry, d_child); } while (unlikely(child->d_flags & DCACHE_DENTRY_KILLED)); rcu_read_unlock(); goto resume; } if (need_seqretry(&rename_lock, seq)) goto rename_retry; rcu_read_unlock(); out_unlock: spin_unlock(&this_parent->d_lock); done_seqretry(&rename_lock, seq); return; rename_retry: spin_unlock(&this_parent->d_lock); rcu_read_unlock(); BUG_ON(seq & 1); if (!retry) return; seq = 1; goto again; } struct check_mount { struct vfsmount *mnt; unsigned int mounted; }; static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; struct path path = { .mnt = info->mnt, .dentry = dentry }; if (likely(!d_mountpoint(dentry))) return D_WALK_CONTINUE; if (__path_is_mountpoint(&path)) { info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * path_has_submounts - check for mounts over a dentry in the * current namespace. * @parent: path to check. * * Return true if the parent or its subdirectories contain * a mount point in the current namespace. */ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; read_seqlock_excl(&mount_lock); d_walk(parent->dentry, &data, path_check_mount); read_sequnlock_excl(&mount_lock); return data.mounted; } EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) { struct dentry *p; int ret = -ENOENT; write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); goto out; } spin_unlock(&p->d_lock); } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { ret = -EBUSY; if (!d_mountpoint(dentry)) { dentry->d_flags |= DCACHE_MOUNTED; ret = 0; } } spin_unlock(&dentry->d_lock); out: write_sequnlock(&rename_lock); return ret; } /* * Search the dentry child list of the specified parent, * and move any unused dentries to the end of the unused * list for prune_dcache(). We descend to the next level * whenever the d_subdirs list is non-empty and continue * searching. * * It returns zero iff there are no unused children, * otherwise it returns the number of children moved to * the end of the unused list. This may not be the total * number of unused children, because select_parent can * drop the lock and return early due to latency * constraints. */ struct select_data { struct dentry *start; union { long found; struct dentry *victim; }; struct list_head dispose; }; static enum d_walk_ret select_collect(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { data->found++; } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!dentry->d_lockref.count) { d_shrink_add(dentry, &data->dispose); data->found++; } } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; enum d_walk_ret ret = D_WALK_CONTINUE; if (data->start == dentry) goto out; if (dentry->d_flags & DCACHE_SHRINK_LIST) { if (!dentry->d_lockref.count) { rcu_read_lock(); data->victim = dentry; return D_WALK_QUIT; } } else { if (dentry->d_flags & DCACHE_LRU_LIST) d_lru_del(dentry); if (!dentry->d_lockref.count) d_shrink_add(dentry, &data->dispose); } /* * We can return to the caller if we have found some (this * ensures forward progress). We'll be coming back to find * the rest. */ if (!list_empty(&data->dispose)) ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY; out: return ret; } /** * shrink_dcache_parent - prune dcache * @parent: parent of entries to prune * * Prune the dcache to remove unused children of the parent dentry. */ void shrink_dcache_parent(struct dentry *parent) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); d_walk(parent, &data, select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); continue; } cond_resched(); if (!data.found) break; data.victim = NULL; d_walk(parent, &data, select_collect2); if (data.victim) { struct dentry *parent; spin_lock(&data.victim->d_lock); if (!shrink_lock_dentry(data.victim)) { spin_unlock(&data.victim->d_lock); rcu_read_unlock(); } else { rcu_read_unlock(); parent = data.victim->d_parent; if (parent != data.victim) __dput_to_list(parent, &data.dispose); __dentry_kill(data.victim); } } if (!list_empty(&data.dispose)) shrink_dentry_list(&data.dispose); } } EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) { /* it has busy descendents; complain about those instead */ if (!list_empty(&dentry->d_subdirs)) return D_WALK_CONTINUE; /* root with refcount 1 is fine */ if (dentry == _data && dentry->d_lockref.count == 1) return D_WALK_CONTINUE; printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} " " still in use (%d) [unmount of %s %s]\n", dentry, dentry->d_inode ? dentry->d_inode->i_ino : 0UL, dentry, dentry->d_lockref.count, dentry->d_sb->s_type->name, dentry->d_sb->s_id); WARN_ON(1); return D_WALK_CONTINUE; } static void do_one_tree(struct dentry *dentry) { shrink_dcache_parent(dentry); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); } /* * destroy the dentries attached to a superblock on unmounting */ void shrink_dcache_for_umount(struct super_block *sb) { struct dentry *dentry; WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked"); dentry = sb->s_root; sb->s_root = NULL; do_one_tree(dentry); while (!hlist_bl_empty(&sb->s_roots)) { dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_roots), struct dentry, d_hash)); do_one_tree(dentry); } } static enum d_walk_ret find_submount(void *_data, struct dentry *dentry) { struct dentry **victim = _data; if (d_mountpoint(dentry)) { __dget_dlock(dentry); *victim = dentry; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** * d_invalidate - detach submounts, prune dcache, and drop * @dentry: dentry to invalidate (aka detach, prune and drop) */ void d_invalidate(struct dentry *dentry) { bool had_submounts = false; spin_lock(&dentry->d_lock); if (d_unhashed(dentry)) { spin_unlock(&dentry->d_lock); return; } __d_drop(dentry); spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) return; shrink_dcache_parent(dentry); for (;;) { struct dentry *victim = NULL; d_walk(dentry, &victim, find_submount); if (!victim) { if (had_submounts) shrink_dcache_parent(dentry); return; } had_submounts = true; detach_mounts(victim); dput(victim); } } EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) { struct dentry *dentry; char *dname; int err; dentry = kmem_cache_alloc(dentry_cache, GFP_KERNEL); if (!dentry) return NULL; /* * We guarantee that the inline name is always NUL-terminated. * This way the memcpy() done by the name switching in rename * will still always have a NUL at the end, even if we might * be overwriting an internal NUL character */ dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (unlikely(!name)) { name = &slash_name; dname = dentry->d_iname; } else if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); struct external_name *p = kmalloc(size + name->len, GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } atomic_set(&p->u.count, 1); dname = p->name; } else { dname = dentry->d_iname; } dentry->d_name.len = name->len; dentry->d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ dentry->d_lockref.count = 1; dentry->d_flags = 0; spin_lock_init(&dentry->d_lock); seqcount_spinlock_init(&dentry->d_seq, &dentry->d_lock); dentry->d_inode = NULL; dentry->d_parent = dentry; dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_child); d_set_d_op(dentry, dentry->d_sb->s_d_op); if (dentry->d_op && dentry->d_op->d_init) { err = dentry->d_op->d_init(dentry); if (err) { if (dname_external(dentry)) kfree(external_name(dentry)); kmem_cache_free(dentry_cache, dentry); return NULL; } } this_cpu_inc(nr_dentry); return dentry; } /** * d_alloc - allocate a dcache entry * @parent: parent of entry to allocate * @name: qstr of the name * * Allocates a dentry. It returns %NULL if there is insufficient memory * available. On a success the dentry is returned. The name passed in is * copied and the copy passed in may be reused after this call. */ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) { struct dentry *dentry = __d_alloc(parent->d_sb, name); if (!dentry) return NULL; spin_lock(&parent->d_lock); /* * don't need child lock because it is not subject * to concurrency here */ __dget_dlock(parent); dentry->d_parent = parent; list_add(&dentry->d_child, &parent->d_subdirs); spin_unlock(&parent->d_lock); return dentry; } EXPORT_SYMBOL(d_alloc); struct dentry *d_alloc_anon(struct super_block *sb) { return __d_alloc(sb, NULL); } EXPORT_SYMBOL(d_alloc_anon); struct dentry *d_alloc_cursor(struct dentry * parent) { struct dentry *dentry = d_alloc_anon(parent->d_sb); if (dentry) { dentry->d_flags |= DCACHE_DENTRY_CURSOR; dentry->d_parent = dget(parent); } return dentry; } /** * d_alloc_pseudo - allocate a dentry (for lookup-less filesystems) * @sb: the superblock * @name: qstr of the name * * For a filesystem that just pins its dentries in memory and never * performs lookups at all, return an unhashed IS_ROOT dentry. * This is used for pipes, sockets et.al. - the stuff that should * never be anyone's children or parents. Unlike all other * dentries, these will not have RCU delay between dropping the * last reference and freeing them. * * The only user is alloc_file_pseudo() and that's what should * be considered a public interface. Don't use directly. */ struct dentry *d_alloc_pseudo(struct super_block *sb, const struct qstr *name) { struct dentry *dentry = __d_alloc(sb, name); if (likely(dentry)) dentry->d_flags |= DCACHE_NORCU; return dentry; } struct dentry *d_alloc_name(struct dentry *parent, const char *name) { struct qstr q; q.name = name; q.hash_len = hashlen_string(parent, name); return d_alloc(parent, &q); } EXPORT_SYMBOL(d_alloc_name); void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) { WARN_ON_ONCE(dentry->d_op); WARN_ON_ONCE(dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | DCACHE_OP_DELETE | DCACHE_OP_REAL)); dentry->d_op = op; if (!op) return; if (op->d_hash) dentry->d_flags |= DCACHE_OP_HASH; if (op->d_compare) dentry->d_flags |= DCACHE_OP_COMPARE; if (op->d_revalidate) dentry->d_flags |= DCACHE_OP_REVALIDATE; if (op->d_weak_revalidate) dentry->d_flags |= DCACHE_OP_WEAK_REVALIDATE; if (op->d_delete) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; if (op->d_real) dentry->d_flags |= DCACHE_OP_REAL; } EXPORT_SYMBOL(d_set_d_op); /* * d_set_fallthru - Mark a dentry as falling through to a lower layer * @dentry - The dentry to mark * * Mark a dentry as falling through to the lower layer (as set with * d_pin_lower()). This flag may be recorded on the medium. */ void d_set_fallthru(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_FALLTHRU; spin_unlock(&dentry->d_lock); } EXPORT_SYMBOL(d_set_fallthru); static unsigned d_flags_for_inode(struct inode *inode) { unsigned add_flags = DCACHE_REGULAR_TYPE; if (!inode) return DCACHE_MISS_TYPE; if (S_ISDIR(inode->i_mode)) { add_flags = DCACHE_DIRECTORY_TYPE; if (unlikely(!(inode->i_opflags & IOP_LOOKUP))) { if (unlikely(!inode->i_op->lookup)) add_flags = DCACHE_AUTODIR_TYPE; else inode->i_opflags |= IOP_LOOKUP; } goto type_determined; } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } inode->i_opflags |= IOP_NOFOLLOW; } if (unlikely(!S_ISREG(inode->i_mode))) add_flags = DCACHE_SPECIAL_TYPE; type_determined: if (unlikely(IS_AUTOMOUNT(inode))) add_flags |= DCACHE_NEED_AUTOMOUNT; return add_flags; } static void __d_instantiate(struct dentry *dentry, struct inode *inode) { unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); spin_lock(&dentry->d_lock); /* * Decrement negative dentry count if it was in the LRU list. */ if (dentry->d_flags & DCACHE_LRU_LIST) this_cpu_dec(nr_dentry_negative); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); spin_unlock(&dentry->d_lock); } /** * d_instantiate - fill in inode information for a dentry * @entry: dentry to complete * @inode: inode to attach to this dentry * * Fill in inode information in the entry. * * This turns negative dentries into productive full members * of society. * * NOTE! This assumes that the inode count has been incremented * (or otherwise set) by the caller to indicate that it is now * in use by the dcache. */ void d_instantiate(struct dentry *entry, struct inode * inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_instantiate); /* * This should be equivalent to d_instantiate() + unlock_new_inode(), * with lockdep-related part of unlock_new_inode() done before * anything else. Use that instead of open-coding d_instantiate()/ * unlock_new_inode() combinations. */ void d_instantiate_new(struct dentry *entry, struct inode *inode) { BUG_ON(!hlist_unhashed(&entry->d_u.d_alias)); BUG_ON(!inode); lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); __d_instantiate(entry, inode); WARN_ON(!(inode->i_state & I_NEW)); inode->i_state &= ~I_NEW & ~I_CREATING; smp_mb(); wake_up_bit(&inode->i_state, __I_NEW); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL(d_instantiate_new); struct dentry *d_make_root(struct inode *root_inode) { struct dentry *res = NULL; if (root_inode) { res = d_alloc_anon(root_inode->i_sb); if (res) d_instantiate(res, root_inode); else iput(root_inode); } return res; } EXPORT_SYMBOL(d_make_root); static struct dentry *__d_instantiate_anon(struct dentry *dentry, struct inode *inode, bool disconnected) { struct dentry *res; unsigned add_flags; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); res = __d_find_any_alias(inode); if (res) { spin_unlock(&inode->i_lock); dput(dentry); goto out_iput; } /* attach a disconnected dentry */ add_flags = d_flags_for_inode(inode); if (disconnected) add_flags |= DCACHE_DISCONNECTED; spin_lock(&dentry->d_lock); __d_set_inode_and_type(dentry, inode, add_flags); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); if (!disconnected) { hlist_bl_lock(&dentry->d_sb->s_roots); hlist_bl_add_head(&dentry->d_hash, &dentry->d_sb->s_roots); hlist_bl_unlock(&dentry->d_sb->s_roots); } spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); return dentry; out_iput: iput(inode); return res; } struct dentry *d_instantiate_anon(struct dentry *dentry, struct inode *inode) { return __d_instantiate_anon(dentry, inode, true); } EXPORT_SYMBOL(d_instantiate_anon); static struct dentry *__d_obtain_alias(struct inode *inode, bool disconnected) { struct dentry *tmp; struct dentry *res; if (!inode) return ERR_PTR(-ESTALE); if (IS_ERR(inode)) return ERR_CAST(inode); res = d_find_any_alias(inode); if (res) goto out_iput; tmp = d_alloc_anon(inode->i_sb); if (!tmp) { res = ERR_PTR(-ENOMEM); goto out_iput; } return __d_instantiate_anon(tmp, inode, disconnected); out_iput: iput(inode); return res; } /** * d_obtain_alias - find or allocate a DISCONNECTED dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain a dentry for an inode resulting from NFS filehandle conversion or * similar open by handle operations. The returned dentry may be anonymous, * or may have a full name (if the inode was already in the cache). * * When called on a directory inode, we must ensure that the inode only ever * has one dentry. If a dentry is found, that is returned instead of * allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is released. * To make it easier to use in export operations a %NULL or IS_ERR inode may * be passed in and the error will be propagated to the return value, * with a %NULL @inode replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_alias(struct inode *inode) { return __d_obtain_alias(inode, true); } EXPORT_SYMBOL(d_obtain_alias); /** * d_obtain_root - find or allocate a dentry for a given inode * @inode: inode to allocate the dentry for * * Obtain an IS_ROOT dentry for the root of a filesystem. * * We must ensure that directory inodes only ever have one dentry. If a * dentry is found, that is returned instead of allocating a new one. * * On successful return, the reference to the inode has been transferred * to the dentry. In case of an error the reference on the inode is * released. A %NULL or IS_ERR inode may be passed in and will be the * error will be propagate to the return value, with a %NULL @inode * replaced by ERR_PTR(-ESTALE). */ struct dentry *d_obtain_root(struct inode *inode) { return __d_obtain_alias(inode, false); } EXPORT_SYMBOL(d_obtain_root); /** * d_add_ci - lookup or allocate new dentry with case-exact name * @inode: the inode case-insensitive lookup has found * @dentry: the negative dentry that was passed to the parent's lookup func * @name: the case-exact name to be associated with the returned dentry * * This is to avoid filling the dcache with case-insensitive names to the * same inode, only the actual correct case is stored in the dcache for * case-insensitive filesystems. * * For a case-insensitive lookup match and if the the case-exact dentry * already exists in in the dcache, use it and return it. * * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { struct dentry *found, *res; /* * First check if a dentry matching the name already exists, * if not go ahead and create it now. */ found = d_hash_and_lookup(dentry->d_parent, name); if (found) { iput(inode); return found; } if (d_in_lookup(dentry)) { found = d_alloc_parallel(dentry->d_parent, name, dentry->d_wait); if (IS_ERR(found) || !d_in_lookup(found)) { iput(inode); return found; } } else { found = d_alloc(dentry->d_parent, name); if (!found) { iput(inode); return ERR_PTR(-ENOMEM); } } res = d_splice_alias(inode, found); if (res) { dput(found); return res; } return found; } EXPORT_SYMBOL(d_add_ci); static inline bool d_same_name(const struct dentry *dentry, const struct dentry *parent, const struct qstr *name) { if (likely(!(parent->d_flags & DCACHE_OP_COMPARE))) { if (dentry->d_name.len != name->len) return false; return dentry_cmp(dentry, name->name, name->len) == 0; } return parent->d_op->d_compare(dentry, dentry->d_name.len, dentry->d_name.name, name) == 0; } /** * __d_lookup_rcu - search for a dentry (racy, store-free) * @parent: parent dentry * @name: qstr of name we wish to find * @seqp: returns d_seq value at the point where the dentry was found * Returns: dentry, or NULL * * __d_lookup_rcu is the dcache lookup function for rcu-walk name * resolution (store-free path walking) design described in * Documentation/filesystems/path-lookup.txt. * * This is not to be used outside core vfs. * * __d_lookup_rcu must only be used in rcu-walk mode, ie. with vfsmount lock * held, and rcu_read_lock held. The returned dentry must not be stored into * without taking d_lock and checking d_seq sequence count against @seq * returned here. * * A refcount may be taken on the found dentry with the d_rcu_to_refcount * function. * * Alternatively, __d_lookup_rcu may be called again to look up the child of * the returned dentry, so long as its parent's seqlock is checked after the * child is looked up. Thus, an interlocking stepping of sequence lock checks * is formed, giving integrity down the path walk. * * NOTE! The caller *has* to check the resulting dentry against the sequence * number we've returned before using any of the resulting dentry state! */ struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seqp) { u64 hashlen = name->hash_len; const unsigned char *str = name->name; struct hlist_bl_head *b = d_hash(hashlen_hash(hashlen)); struct hlist_bl_node *node; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Carefully use d_seq when comparing a candidate dentry, to avoid * races with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { unsigned seq; seqretry: /* * The dentry sequence count protects us from concurrent * renames, and thus protects parent and name fields. * * The caller must perform a seqcount check in order * to do anything useful with the returned dentry. * * NOTE! We do a "raw" seqcount_begin here. That means that * we don't wait for the sequence count to stabilize if it * is in the middle of a sequence change. If we do the slow * dentry compare, we will do seqretries until it is stable, * and if we end up with a successful lookup, we actually * want to exit RCU lookup anyway. * * Note that raw_seqcount_begin still *does* smp_rmb(), so * we are still guaranteed NUL-termination of ->d_name.name. */ seq = raw_seqcount_begin(&dentry->d_seq); if (dentry->d_parent != parent) continue; if (d_unhashed(dentry)) continue; if (unlikely(parent->d_flags & DCACHE_OP_COMPARE)) { int tlen; const char *tname; if (dentry->d_name.hash != hashlen_hash(hashlen)) continue; tlen = dentry->d_name.len; tname = dentry->d_name.name; /* we want a consistent (name,len) pair */ if (read_seqcount_retry(&dentry->d_seq, seq)) { cpu_relax(); goto seqretry; } if (parent->d_op->d_compare(dentry, tlen, tname, name) != 0) continue; } else { if (dentry->d_name.hash_len != hashlen) continue; if (dentry_cmp(dentry, str, hashlen_len(hashlen)) != 0) continue; } *seqp = seq; return dentry; } return NULL; } /** * d_lookup - search for a dentry * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * d_lookup searches the children of the parent dentry for the name in * question. If the dentry is found its reference count is incremented and the * dentry is returned. The caller must use dput to free the entry when it has * finished using it. %NULL is returned if the dentry does not exist. */ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) { struct dentry *dentry; unsigned seq; do { seq = read_seqbegin(&rename_lock); dentry = __d_lookup(parent, name); if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; } EXPORT_SYMBOL(d_lookup); /** * __d_lookup - search for a dentry (racy) * @parent: parent dentry * @name: qstr of name we wish to find * Returns: dentry, or NULL * * __d_lookup is like d_lookup, however it may (rarely) return a * false-negative result due to unrelated rename activity. * * __d_lookup is slightly faster by avoiding rename_lock read seqlock, * however it must be used carefully, eg. with a following d_lookup in * the case of failure. * * __d_lookup callers must be commented. */ struct dentry *__d_lookup(const struct dentry *parent, const struct qstr *name) { unsigned int hash = name->hash; struct hlist_bl_head *b = d_hash(hash); struct hlist_bl_node *node; struct dentry *found = NULL; struct dentry *dentry; /* * Note: There is significant duplication with __d_lookup_rcu which is * required to prevent single threaded performance regressions * especially on architectures where smp_rmb (in seqcounts) are costly. * Keep the two functions in sync. */ /* * The hash list is protected using RCU. * * Take d_lock when comparing a candidate dentry, to avoid races * with d_move(). * * It is possible that concurrent renames can mess up our list * walk here and result in missing our dentry, resulting in the * false-negative result. d_lookup() protects against concurrent * renames using rename_lock seqlock. * * See Documentation/filesystems/path-lookup.txt for more details. */ rcu_read_lock(); hlist_bl_for_each_entry_rcu(dentry, node, b, d_hash) { if (dentry->d_name.hash != hash) continue; spin_lock(&dentry->d_lock); if (dentry->d_parent != parent) goto next; if (d_unhashed(dentry)) goto next; if (!d_same_name(dentry, parent, name)) goto next; dentry->d_lockref.count++; found = dentry; spin_unlock(&dentry->d_lock); break; next: spin_unlock(&dentry->d_lock); } rcu_read_unlock(); return found; } /** * d_hash_and_lookup - hash the qstr then search for a dentry * @dir: Directory to search in * @name: qstr of name we wish to find * * On lookup failure NULL is returned; on bad name - ERR_PTR(-error) */ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) { /* * Check for a fs-specific hash function. Note that we must * calculate the standard hash first, as the d_op->d_hash() * routine may choose to leave the hash value unchanged. */ name->hash = full_name_hash(dir, name->name, name->len); if (dir->d_flags & DCACHE_OP_HASH) { int err = dir->d_op->d_hash(dir, name); if (unlikely(err < 0)) return ERR_PTR(err); } return d_lookup(dir, name); } EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: * - turn this dentry into a negative dentry * - unhash this dentry and free it. * * Usually, we want to just turn this into * a negative dentry, but if anybody else is * currently using the dentry or the inode * we can't do that and we fall back on removing * it from the hash queues and waiting for * it to be deleted later when it has no users */ /** * d_delete - delete a dentry * @dentry: The dentry to delete * * Turn the dentry into a negative dentry if possible, otherwise * remove it from the hash queues so it can be deleted later */ void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); /* * Are we the only user? */ if (dentry->d_lockref.count == 1) { dentry->d_flags &= ~DCACHE_CANT_MOUNT; dentry_unlink_inode(dentry); } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); } } EXPORT_SYMBOL(d_delete); static void __d_rehash(struct dentry *entry) { struct hlist_bl_head *b = d_hash(entry->d_name.hash); hlist_bl_lock(b); hlist_bl_add_head_rcu(&entry->d_hash, b); hlist_bl_unlock(b); } /** * d_rehash - add an entry back to the hash * @entry: dentry to add to the hash * * Adds a dentry to the hash according to its name. */ void d_rehash(struct dentry * entry) { spin_lock(&entry->d_lock); __d_rehash(entry); spin_unlock(&entry->d_lock); } EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { for (;;) { unsigned n = dir->i_dir_seq; if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) return n; cpu_relax(); } } static inline void end_dir_add(struct inode *dir, unsigned n) { smp_store_release(&dir->i_dir_seq, n + 2); } static void d_wait_lookup(struct dentry *dentry) { if (d_in_lookup(dentry)) { DECLARE_WAITQUEUE(wait, current); add_wait_queue(dentry->d_wait, &wait); do { set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock(&dentry->d_lock); schedule(); spin_lock(&dentry->d_lock); } while (d_in_lookup(dentry)); } } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, wait_queue_head_t *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); struct hlist_bl_node *node; struct dentry *new = d_alloc(parent, name); struct dentry *dentry; unsigned seq, r_seq, d_seq; if (unlikely(!new)) return ERR_PTR(-ENOMEM); retry: rcu_read_lock(); seq = smp_load_acquire(&parent->d_inode->i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } if (read_seqcount_retry(&dentry->d_seq, d_seq)) { rcu_read_unlock(); dput(dentry); goto retry; } rcu_read_unlock(); dput(new); return dentry; } if (unlikely(read_seqretry(&rename_lock, r_seq))) { rcu_read_unlock(); goto retry; } if (unlikely(seq & 1)) { rcu_read_unlock(); goto retry; } hlist_bl_lock(b); if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; } /* * No changes for the parent since the beginning of d_lookup(). * Since all removals from the chain happen with hlist_bl_lock(), * any potential in-lookup matches are going to stay here until * we unlock the chain. All fields are stable in everything * we encounter. */ hlist_bl_for_each_entry(dentry, node, b, d_u.d_in_lookup_hash) { if (dentry->d_name.hash != hash) continue; if (dentry->d_parent != parent) continue; if (!d_same_name(dentry, parent, name)) continue; hlist_bl_unlock(b); /* now we can try to grab a reference */ if (!lockref_get_not_dead(&dentry->d_lockref)) { rcu_read_unlock(); goto retry; } rcu_read_unlock(); /* * somebody is likely to be still doing lookup for it; * wait for them to finish */ spin_lock(&dentry->d_lock); d_wait_lookup(dentry); /* * it's not in-lookup anymore; in principle we should repeat * everything from dcache lookup, but it's likely to be what * d_lookup() would've found anyway. If it is, just return it; * otherwise we really have to repeat the whole thing. */ if (unlikely(dentry->d_name.hash != hash)) goto mismatch; if (unlikely(dentry->d_parent != parent)) goto mismatch; if (unlikely(d_unhashed(dentry))) goto mismatch; if (unlikely(!d_same_name(dentry, parent, name))) goto mismatch; /* OK, it *is* a hashed match; return it */ spin_unlock(&dentry->d_lock); dput(new); return dentry; } rcu_read_unlock(); /* we can't take ->d_lock here; it's OK, though. */ new->d_flags |= DCACHE_PAR_LOOKUP; new->d_wait = wq; hlist_bl_add_head_rcu(&new->d_u.d_in_lookup_hash, b); hlist_bl_unlock(b); return new; mismatch: spin_unlock(&dentry->d_lock); dput(dentry); goto retry; } EXPORT_SYMBOL(d_alloc_parallel); void __d_lookup_done(struct dentry *dentry) { struct hlist_bl_head *b = in_lookup_hash(dentry->d_parent, dentry->d_name.hash); hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); wake_up_all(dentry->d_wait); dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); INIT_LIST_HEAD(&dentry->d_lru); } EXPORT_SYMBOL(__d_lookup_done); /* inode->i_lock held if inode is non-NULL */ static inline void __d_add(struct dentry *dentry, struct inode *inode) { struct inode *dir = NULL; unsigned n; spin_lock(&dentry->d_lock); if (unlikely(d_in_lookup(dentry))) { dir = dentry->d_parent->d_inode; n = start_dir_add(dir); __d_lookup_done(dentry); } if (inode) { unsigned add_flags = d_flags_for_inode(inode); hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); raw_write_seqcount_begin(&dentry->d_seq); __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); } __d_rehash(dentry); if (dir) end_dir_add(dir, n); spin_unlock(&dentry->d_lock); if (inode) spin_unlock(&inode->i_lock); } /** * d_add - add dentry to hash queues * @entry: dentry to add * @inode: The inode to attach to this dentry * * This adds the entry to the hash queues and initializes @inode. * The entry was actually filled in earlier during d_alloc(). */ void d_add(struct dentry *entry, struct inode *inode) { if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); } __d_add(entry, inode); } EXPORT_SYMBOL(d_add); /** * d_exact_alias - find and hash an exact unhashed alias * @entry: dentry to add * @inode: The inode to go with this dentry * * If an unhashed dentry with the same name/parent and desired * inode already exists, hash and return it. Otherwise, return * NULL. * * Parent directory should be locked. */ struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) { struct dentry *alias; unsigned int hash = entry->d_name.hash; spin_lock(&inode->i_lock); hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { /* * Don't need alias->d_lock here, because aliases with * d_parent == entry->d_parent are not subject to name or * parent changes, because the parent inode i_mutex is held. */ if (alias->d_name.hash != hash) continue; if (alias->d_parent != entry->d_parent) continue; if (!d_same_name(alias, entry->d_parent, &entry->d_name)) continue; spin_lock(&alias->d_lock); if (!d_unhashed(alias)) { spin_unlock(&alias->d_lock); alias = NULL; } else { __dget_dlock(alias); __d_rehash(alias); spin_unlock(&alias->d_lock); } spin_unlock(&inode->i_lock); return alias; } spin_unlock(&inode->i_lock); return NULL; } EXPORT_SYMBOL(d_exact_alias); static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ memcpy(target->d_iname, dentry->d_name.name, dentry->d_name.len + 1); dentry->d_name.name = target->d_name.name; target->d_name.name = target->d_iname; } } else { if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal */ memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); target->d_name.name = dentry->d_name.name; dentry->d_name.name = dentry->d_iname; } else { /* * Both are internal. */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); } } } swap(dentry->d_name.hash_len, target->d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) { struct external_name *old_name = NULL; if (unlikely(dname_external(dentry))) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->u.count); dentry->d_name = target->d_name; } else { memcpy(dentry->d_iname, target->d_name.name, target->d_name.len + 1); dentry->d_name.name = dentry->d_iname; dentry->d_name.hash_len = target->d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) kfree_rcu(old_name, u.head); } /* * __d_move - move a dentry * @dentry: entry to move * @target: new dentry * @exchange: exchange the two dentries * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. Caller must hold * rename_lock, the i_mutex of the source and target directories, * and the sb->s_vfs_rename_mutex if they differ. See lock_rename(). */ static void __d_move(struct dentry *dentry, struct dentry *target, bool exchange) { struct dentry *old_parent, *p; struct inode *dir = NULL; unsigned n; WARN_ON(!dentry->d_inode); if (WARN_ON(dentry == target)) return; BUG_ON(d_ancestor(target, dentry)); old_parent = dentry->d_parent; p = d_ancestor(old_parent, target); if (IS_ROOT(dentry)) { BUG_ON(p); spin_lock(&target->d_parent->d_lock); } else if (!p) { /* target is not a descendent of dentry->d_parent */ spin_lock(&target->d_parent->d_lock); spin_lock_nested(&old_parent->d_lock, DENTRY_D_LOCK_NESTED); } else { BUG_ON(p == dentry); spin_lock(&old_parent->d_lock); if (p != target) spin_lock_nested(&target->d_parent->d_lock, DENTRY_D_LOCK_NESTED); } spin_lock_nested(&dentry->d_lock, 2); spin_lock_nested(&target->d_lock, 3); if (unlikely(d_in_lookup(target))) { dir = target->d_parent->d_inode; n = start_dir_add(dir); __d_lookup_done(target); } write_seqcount_begin(&dentry->d_seq); write_seqcount_begin_nested(&target->d_seq, DENTRY_D_LOCK_NESTED); /* unhash both */ if (!d_unhashed(dentry)) ___d_drop(dentry); if (!d_unhashed(target)) ___d_drop(target); /* ... and switch them in the tree */ dentry->d_parent = target->d_parent; if (!exchange) { copy_name(dentry, target); target->d_hash.pprev = NULL; dentry->d_parent->d_lockref.count++; if (dentry != old_parent) /* wasn't IS_ROOT */ WARN_ON(!--old_parent->d_lockref.count); } else { target->d_parent = old_parent; swap_names(dentry, target); list_move(&target->d_child, &target->d_parent->d_subdirs); __d_rehash(target); fsnotify_update_flags(target); } list_move(&dentry->d_child, &dentry->d_parent->d_subdirs); __d_rehash(dentry); fsnotify_update_flags(dentry); fscrypt_handle_d_move(dentry); write_seqcount_end(&target->d_seq); write_seqcount_end(&dentry->d_seq); if (dir) end_dir_add(dir, n); if (dentry->d_parent != old_parent) spin_unlock(&dentry->d_parent->d_lock); if (dentry != old_parent) spin_unlock(&old_parent->d_lock); spin_unlock(&target->d_lock); spin_unlock(&dentry->d_lock); } /* * d_move - move a dentry * @dentry: entry to move * @target: new dentry * * Update the dcache to reflect the move of a file name. Negative * dcache entries should not be moved in this way. See the locking * requirements for __d_move. */ void d_move(struct dentry *dentry, struct dentry *target) { write_seqlock(&rename_lock); __d_move(dentry, target, false); write_sequnlock(&rename_lock); } EXPORT_SYMBOL(d_move); /* * d_exchange - exchange two dentries * @dentry1: first dentry * @dentry2: second dentry */ void d_exchange(struct dentry *dentry1, struct dentry *dentry2) { write_seqlock(&rename_lock); WARN_ON(!dentry1->d_inode); WARN_ON(!dentry2->d_inode); WARN_ON(IS_ROOT(dentry1)); WARN_ON(IS_ROOT(dentry2)); __d_move(dentry1, dentry2, true); write_sequnlock(&rename_lock); } /** * d_ancestor - search for an ancestor * @p1: ancestor dentry * @p2: child dentry * * Returns the ancestor dentry of p2 which is a child of p1, if p1 is * an ancestor of p2, else NULL. */ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) { struct dentry *p; for (p = p2; !IS_ROOT(p); p = p->d_parent) { if (p->d_parent == p1) return p; } return NULL; } /* * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too... */ static int __d_unalias(struct inode *inode, struct dentry *dentry, struct dentry *alias) { struct mutex *m1 = NULL; struct rw_semaphore *m2 = NULL; int ret = -ESTALE; /* If alias and dentry share a parent, then no extra locks required */ if (alias->d_parent == dentry->d_parent) goto out_unalias; /* See lock_rename() */ if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; if (!inode_trylock_shared(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_rwsem; out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: if (m2) up_read(m2); if (m1) mutex_unlock(m1); return ret; } /** * d_splice_alias - splice a disconnected dentry into the tree if one exists * @inode: the inode which may have a disconnected dentry * @dentry: a negative dentry which we want to point to the inode. * * If inode is a directory and has an IS_ROOT alias, then d_move that in * place of the given dentry and return it, else simply d_add the inode * to the dentry and return NULL. * * If a non-IS_ROOT directory is found, the filesystem is corrupt, and * we should error out: directories can't have multiple aliases. * * This is needed in the lookup routine of any filesystem that is exportable * (via knfsd) so that we can build dcache paths to directories effectively. * * If a dentry was found and moved, then it is returned. Otherwise NULL * is returned. This matches the expected return value of ->lookup. * * Cluster filesystems may call this function with a negative, hashed dentry. * In that case, we know that the inode will be a regular file, and also this * will only occur during atomic_open. So we need to check for the dentry * being already hashed only in the final case. */ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) { if (IS_ERR(inode)) return ERR_CAST(inode); BUG_ON(!d_unhashed(dentry)); if (!inode) goto out; security_d_instantiate(dentry, inode); spin_lock(&inode->i_lock); if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { /* The reference to new ensures it remains an alias */ spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( "VFS: Lookup of '%s' in %s %s" " would have caused loop\n", dentry->d_name.name, inode->i_sb->s_type->name, inode->i_sb->s_id); } else if (!IS_ROOT(new)) { struct dentry *old_parent = dget(new->d_parent); int err = __d_unalias(inode, dentry, new); write_sequnlock(&rename_lock); if (err) { dput(new); new = ERR_PTR(err); } dput(old_parent); } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); } iput(inode); return new; } } out: __d_add(dentry, inode); return NULL; } EXPORT_SYMBOL(d_splice_alias); /* * Test whether new_dentry is a subdirectory of old_dentry. * * Trivially implemented using the dcache structure */ /** * is_subdir - is new dentry a subdirectory of old_dentry * @new_dentry: new dentry * @old_dentry: old dentry * * Returns true if new_dentry is a subdirectory of the parent (at any depth). * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { bool result; unsigned seq; if (new_dentry == old_dentry) return true; do { /* for restarting inner loop in case of seq retry */ seq = read_seqbegin(&rename_lock); /* * Need rcu_readlock to protect against the d_parent trashing * due to d_move */ rcu_read_lock(); if (d_ancestor(old_dentry, new_dentry)) result = true; else result = false; rcu_read_unlock(); } while (read_seqretry(&rename_lock, seq)); return result; } EXPORT_SYMBOL(is_subdir); static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) { struct dentry *root = data; if (dentry != root) { if (d_unhashed(dentry) || !dentry->d_inode) return D_WALK_SKIP; if (!(dentry->d_flags & DCACHE_GENOCIDE)) { dentry->d_flags |= DCACHE_GENOCIDE; dentry->d_lockref.count--; } } return D_WALK_CONTINUE; } void d_genocide(struct dentry *parent) { d_walk(parent, parent, d_genocide_kill); } EXPORT_SYMBOL(d_genocide); void d_tmpfile(struct dentry *dentry, struct inode *inode) { inode_dec_link_count(inode); BUG_ON(dentry->d_name.name != dentry->d_iname || !hlist_unhashed(&dentry->d_u.d_alias) || !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); dentry->d_name.len = sprintf(dentry->d_iname, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); d_instantiate(dentry, inode); } EXPORT_SYMBOL(d_tmpfile); static __initdata unsigned long dhash_entries; static int __init set_dhash_entries(char *str) { if (!str) return 0; dhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("dhash_entries=", set_dhash_entries); static void __init dcache_init_early(void) { /* If hashes are distributed across NUMA nodes, defer * hash allocation until vmalloc space is available. */ if (hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_EARLY | HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; } static void __init dcache_init(void) { /* * A constructor could be added for stable state like the lists, * but it is probably not worth it because of the cache nature * of the dcache. */ dentry_cache = KMEM_CACHE_USERCOPY(dentry, SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT, d_iname); /* Hash may have been set up in dcache_init_early */ if (!hashdist) return; dentry_hashtable = alloc_large_system_hash("Dentry cache", sizeof(struct hlist_bl_head), dhash_entries, 13, HASH_ZERO, &d_hash_shift, NULL, 0, 0); d_hash_shift = 32 - d_hash_shift; } /* SLAB cache for __getname() consumers */ struct kmem_cache *names_cachep __read_mostly; EXPORT_SYMBOL(names_cachep); void __init vfs_caches_init_early(void) { int i; for (i = 0; i < ARRAY_SIZE(in_lookup_hashtable); i++) INIT_HLIST_BL_HEAD(&in_lookup_hashtable[i]); dcache_init_early(); inode_init_early(); } void __init vfs_caches_init(void) { names_cachep = kmem_cache_create_usercopy("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, 0, PATH_MAX, NULL); dcache_init(); inode_init(); files_init(); files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 // SPDX-License-Identifier: GPL-2.0 /* * hrtimers - High-resolution kernel timers * * Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005, Red Hat, Inc., Ingo Molnar * * data type definitions, declarations, prototypes * * Started by: Thomas Gleixner and Ingo Molnar */ #ifndef _LINUX_HRTIMER_H #define _LINUX_HRTIMER_H #include <linux/hrtimer_defs.h> #include <linux/rbtree.h> #include <linux/init.h> #include <linux/list.h> #include <linux/percpu.h> #include <linux/seqlock.h> #include <linux/timer.h> #include <linux/timerqueue.h> struct hrtimer_clock_base; struct hrtimer_cpu_base; /* * Mode arguments of xxx_hrtimer functions: * * HRTIMER_MODE_ABS - Time value is absolute * HRTIMER_MODE_REL - Time value is relative to now * HRTIMER_MODE_PINNED - Timer is bound to CPU (is only considered * when starting the timer) * HRTIMER_MODE_SOFT - Timer callback function will be executed in * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, HRTIMER_MODE_REL = 0x01, HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, HRTIMER_MODE_ABS_SOFT = HRTIMER_MODE_ABS | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_SOFT = HRTIMER_MODE_REL | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_PINNED_SOFT = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_REL_PINNED_SOFT = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_SOFT, HRTIMER_MODE_ABS_HARD = HRTIMER_MODE_ABS | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_HARD = HRTIMER_MODE_REL | HRTIMER_MODE_HARD, HRTIMER_MODE_ABS_PINNED_HARD = HRTIMER_MODE_ABS_PINNED | HRTIMER_MODE_HARD, HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; /* * Return values for the callback function */ enum hrtimer_restart { HRTIMER_NORESTART, /* Timer is not restarted */ HRTIMER_RESTART, /* Timer must be restarted */ }; /* * Values to track state of the timer * * Possible states: * * 0x00 inactive * 0x01 enqueued into rbtree * * The callback state is not part of the timer->state because clearing it would * mean touching the timer after the callback, this makes it impossible to free * the timer from the callback function. * * Therefore we track the callback state in: * * timer->base->cpu_base->running == timer * * On SMP it is possible to have a "callback function running and enqueued" * status. It happens for example when a posix timer expired and the callback * queued a signal. Between dropping the lock which protects the posix timer * and reacquiring the base lock of the hrtimer, another CPU can deliver the * signal and rearm the timer. * * All state transitions are protected by cpu_base->lock. */ #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 /** * struct hrtimer - the basic hrtimer structure * @node: timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding * slack to the _softexpires value. For non range timers * identical to _softexpires. * @_softexpires: the absolute earliest expiry time of the hrtimer. * The time which was given as expiry time when the timer * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. * * The hrtimer structure must be initialized by hrtimer_init() */ struct hrtimer { struct timerqueue_node node; ktime_t _softexpires; enum hrtimer_restart (*function)(struct hrtimer *); struct hrtimer_clock_base *base; u8 state; u8 is_rel; u8 is_soft; u8 is_hard; }; /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure * @task: task to wake up * * task is set to NULL, when the timer expires. */ struct hrtimer_sleeper { struct hrtimer timer; struct task_struct *task; }; #ifdef CONFIG_64BIT # define __hrtimer_clock_base_align ____cacheline_aligned #else # define __hrtimer_clock_base_align #endif /** * struct hrtimer_clock_base - the timer base for a specific clock * @cpu_base: per cpu clock base * @index: clock type index for per_cpu support when moving a * timer to a base on another cpu. * @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @get_time: function to retrieve the current time of the clock * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; unsigned int index; clockid_t clockid; seqcount_raw_spinlock_t seq; struct hrtimer *running; struct timerqueue_head active; ktime_t (*get_time)(void); ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, HRTIMER_BASE_TAI, HRTIMER_BASE_MONOTONIC_SOFT, HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, HRTIMER_MAX_CLOCK_BASES, }; /** * struct hrtimer_cpu_base - the per cpu clock bases * @lock: lock protecting the base and associated clock bases * and timers * @cpu: cpu number * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode * @in_hrtirq: hrtimer_interrupt() is currently executing * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. * @nr_events: Total number of hrtimer interrupt events * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are * expired * @timer_waiters: A hrtimer_cancel() invocation waits for the timer * callback to finish. * @expires_next: absolute time of the next event, is required for remote * hrtimer enqueue; it is the total first expiry time (hard * and soft hrtimer are taken into account) * @next_timer: Pointer to the first expiring timer * @softirq_expires_next: Time to check, if soft queues needs also to be expired * @softirq_next_timer: Pointer to the first expiring softirq based timer * @clock_base: array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). * Do not dereference the pointer because it is not reliable on * cross cpu removals. */ struct hrtimer_cpu_base { raw_spinlock_t lock; unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; unsigned int hres_active : 1, in_hrtirq : 1, hang_detected : 1, softirq_activated : 1; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; unsigned short nr_hangs; unsigned int max_hang_time; #endif #ifdef CONFIG_PREEMPT_RT spinlock_t softirq_expiry_lock; atomic_t timer_waiters; #endif ktime_t expires_next; struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; } ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = time; timer->_softexpires = time; } static inline void hrtimer_set_expires_range(struct hrtimer *timer, ktime_t time, ktime_t delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, delta); } static inline void hrtimer_set_expires_range_ns(struct hrtimer *timer, ktime_t time, u64 delta) { timer->_softexpires = time; timer->node.expires = ktime_add_safe(time, ns_to_ktime(delta)); } static inline void hrtimer_set_expires_tv64(struct hrtimer *timer, s64 tv64) { timer->node.expires = tv64; timer->_softexpires = tv64; } static inline void hrtimer_add_expires(struct hrtimer *timer, ktime_t time) { timer->node.expires = ktime_add_safe(timer->node.expires, time); timer->_softexpires = ktime_add_safe(timer->_softexpires, time); } static inline void hrtimer_add_expires_ns(struct hrtimer *timer, u64 ns) { timer->node.expires = ktime_add_ns(timer->node.expires, ns); timer->_softexpires = ktime_add_ns(timer->_softexpires, ns); } static inline ktime_t hrtimer_get_expires(const struct hrtimer *timer) { return timer->node.expires; } static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_tv64(const struct hrtimer *timer) { return timer->node.expires; } static inline s64 hrtimer_get_softexpires_tv64(const struct hrtimer *timer) { return timer->_softexpires; } static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) { return ktime_to_ns(timer->node.expires); } static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) { return ktime_sub(timer->node.expires, timer->base->get_time()); } static inline ktime_t hrtimer_cb_get_time(struct hrtimer *timer) { return timer->base->get_time(); } static inline int hrtimer_is_hres_active(struct hrtimer *timer) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? timer->base->cpu_base->hres_active : 0; } #ifdef CONFIG_HIGH_RES_TIMERS struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); extern unsigned int hrtimer_resolution; #else #define hrtimer_resolution (unsigned int)LOW_RES_NSEC #endif static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) { ktime_t rem = ktime_sub(timer->node.expires, now); /* * Adjust relative timers for the extra we added in * hrtimer_start_range_ns() to prevent short timeouts. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && timer->is_rel) rem -= hrtimer_resolution; return rem; } static inline ktime_t hrtimer_expires_remaining_adjusted(const struct hrtimer *timer) { return __hrtimer_expires_remaining_adjusted(timer, timer->base->get_time()); } #ifdef CONFIG_TIMERFD extern void timerfd_clock_was_set(void); #else static inline void timerfd_clock_was_set(void) { } #endif extern void hrtimers_resume(void); DECLARE_PER_CPU(struct tick_device, tick_cpu_device); #ifdef CONFIG_PREEMPT_RT void hrtimer_cancel_wait_running(const struct hrtimer *timer); #else static inline void hrtimer_cancel_wait_running(struct hrtimer *timer) { cpu_relax(); } #endif /* Exported timer functions: */ /* Initialize timers: */ extern void hrtimer_init(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS extern void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode); extern void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); extern void destroy_hrtimer_on_stack(struct hrtimer *timer); #else static inline void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t which_clock, enum hrtimer_mode mode) { hrtimer_init(timer, which_clock, mode); } static inline void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { hrtimer_init_sleeper(sl, clock_id, mode); } static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } #endif /* Basic timer operations: */ extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { hrtimer_start_range_ns(timer, tim, 0, mode); } extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { u64 delta; ktime_t soft, hard; soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); static inline void hrtimer_restart(struct hrtimer *timer) { hrtimer_start_expires(timer, HRTIMER_MODE_ABS); } /* Query timers: */ extern ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust); static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) { return __hrtimer_get_remaining(timer, false); } extern u64 hrtimer_get_next_event(void); extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); /** * hrtimer_is_queued = check, whether the timer is on one of the queues * @timer: Timer to check * * Returns: True if the timer is queued, false otherwise * * The function can be used lockless, but it gives only a current snapshot. */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { /* The READ_ONCE pairs with the update functions of timer->state */ return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); } /* * Helper function to check, whether the timer is running the callback * function */ static inline int hrtimer_callback_running(struct hrtimer *timer) { return timer->base->running == timer; } /* Forward a hrtimer so it expires after now: */ extern u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval); /** * hrtimer_forward_now - forward the timer expiry so it expires after now * @timer: hrtimer to forward * @interval: the interval to forward * * Forward the timer expiry so it will expire after the current time * of the hrtimer clock base. Returns the number of overruns. * * Can be safely called from the callback function of @timer. If * called from other contexts @timer must neither be enqueued nor * running the callback and the caller needs to take care of * serialization. * * Note: This only updates the timer expiry value and does not requeue * the timer. */ static inline u64 hrtimer_forward_now(struct hrtimer *timer, ktime_t interval) { return hrtimer_forward(timer, timer->base->get_time(), interval); } /* Precise sleep: */ extern int nanosleep_copyout(struct restart_block *, struct timespec64 *); extern long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid); extern int schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode); extern int schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id); extern int schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode); /* Soft interrupt function to run the hrtimer queues: */ extern void hrtimer_run_queues(void); /* Bootup initialization: */ extern void __init hrtimers_init(void); /* Show pending timers: */ extern void sysrq_timer_list_show(void); int hrtimers_prepare_cpu(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU int hrtimers_dead_cpu(unsigned int cpu); #else #define hrtimers_dead_cpu NULL #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 /* SPDX-License-Identifier: GPL-2.0+ */ #ifndef _LINUX_XARRAY_H #define _LINUX_XARRAY_H /* * eXtensible Arrays * Copyright (c) 2017 Microsoft Corporation * Author: Matthew Wilcox <willy@infradead.org> * * See Documentation/core-api/xarray.rst for how to use the XArray. */ #include <linux/bug.h> #include <linux/compiler.h> #include <linux/gfp.h> #include <linux/kconfig.h> #include <linux/kernel.h> #include <linux/rcupdate.h> #include <linux/spinlock.h> #include <linux/types.h> /* * The bottom two bits of the entry determine how the XArray interprets * the contents: * * 00: Pointer entry * 10: Internal entry * x1: Value entry or tagged pointer * * Attempting to store internal entries in the XArray is a bug. * * Most internal entries are pointers to the next node in the tree. * The following internal entries have a special meaning: * * 0-62: Sibling entries * 256: Retry entry * 257: Zero entry * * Errors are also represented as internal entries, but use the negative * space (-4094 to -2). They're never stored in the slots array; only * returned by the normal API. */ #define BITS_PER_XA_VALUE (BITS_PER_LONG - 1) /** * xa_mk_value() - Create an XArray entry from an integer. * @v: Value to store in XArray. * * Context: Any context. * Return: An entry suitable for storing in the XArray. */ static inline void *xa_mk_value(unsigned long v) { WARN_ON((long)v < 0); return (void *)((v << 1) | 1); } /** * xa_to_value() - Get value stored in an XArray entry. * @entry: XArray entry. * * Context: Any context. * Return: The value stored in the XArray entry. */ static inline unsigned long xa_to_value(const void *entry) { return (unsigned long)entry >> 1; } /** * xa_is_value() - Determine if an entry is a value. * @entry: XArray entry. * * Context: Any context. * Return: True if the entry is a value, false if it is a pointer. */ static inline bool xa_is_value(const void *entry) { return (unsigned long)entry & 1; } /** * xa_tag_pointer() - Create an XArray entry for a tagged pointer. * @p: Plain pointer. * @tag: Tag value (0, 1 or 3). * * If the user of the XArray prefers, they can tag their pointers instead * of storing value entries. Three tags are available (0, 1 and 3). * These are distinct from the xa_mark_t as they are not replicated up * through the array and cannot be searched for. * * Context: Any context. * Return: An XArray entry. */ static inline void *xa_tag_pointer(void *p, unsigned long tag) { return (void *)((unsigned long)p | tag); } /** * xa_untag_pointer() - Turn an XArray entry into a plain pointer. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the untagged version of the pointer. * * Context: Any context. * Return: A pointer. */ static inline void *xa_untag_pointer(void *entry) { return (void *)((unsigned long)entry & ~3UL); } /** * xa_pointer_tag() - Get the tag stored in an XArray entry. * @entry: XArray entry. * * If you have stored a tagged pointer in the XArray, call this function * to get the tag of that pointer. * * Context: Any context. * Return: A tag. */ static inline unsigned int xa_pointer_tag(void *entry) { return (unsigned long)entry & 3UL; } /* * xa_mk_internal() - Create an internal entry. * @v: Value to turn into an internal entry. * * Internal entries are used for a number of purposes. Entries 0-255 are * used for sibling entries (only 0-62 are used by the current code). 256 * is used for the retry entry. 257 is used for the reserved / zero entry. * Negative internal entries are used to represent errnos. Node pointers * are also tagged as internal entries in some situations. * * Context: Any context. * Return: An XArray internal entry corresponding to this value. */ static inline void *xa_mk_internal(unsigned long v) { return (void *)((v << 2) | 2); } /* * xa_to_internal() - Extract the value from an internal entry. * @entry: XArray entry. * * Context: Any context. * Return: The value which was stored in the internal entry. */ static inline unsigned long xa_to_internal(const void *entry) { return (unsigned long)entry >> 2; } /* * xa_is_internal() - Is the entry an internal entry? * @entry: XArray entry. * * Context: Any context. * Return: %true if the entry is an internal entry. */ static inline bool xa_is_internal(const void *entry) { return ((unsigned long)entry & 3) == 2; } #define XA_ZERO_ENTRY xa_mk_internal(257) /** * xa_is_zero() - Is the entry a zero entry? * @entry: Entry retrieved from the XArray * * The normal API will return NULL as the contents of a slot containing * a zero entry. You can only see zero entries by using the advanced API. * * Return: %true if the entry is a zero entry. */ static inline bool xa_is_zero(const void *entry) { return unlikely(entry == XA_ZERO_ENTRY); } /** * xa_is_err() - Report whether an XArray operation returned an error * @entry: Result from calling an XArray function * * If an XArray operation cannot complete an operation, it will return * a special value indicating an error. This function tells you * whether an error occurred; xa_err() tells you which error occurred. * * Context: Any context. * Return: %true if the entry indicates an error. */ static inline bool xa_is_err(const void *entry) { return unlikely(xa_is_internal(entry) && entry >= xa_mk_internal(-MAX_ERRNO)); } /** * xa_err() - Turn an XArray result into an errno. * @entry: Result from calling an XArray function. * * If an XArray operation cannot complete an operation, it will return * a special pointer value which encodes an errno. This function extracts * the errno from the pointer value, or returns 0 if the pointer does not * represent an errno. * * Context: Any context. * Return: A negative errno or 0. */ static inline int xa_err(void *entry) { /* xa_to_internal() would not do sign extension. */ if (xa_is_err(entry)) return (long)entry >> 2; return 0; } /** * struct xa_limit - Represents a range of IDs. * @min: The lowest ID to allocate (inclusive). * @max: The maximum ID to allocate (inclusive). * * This structure is used either directly or via the XA_LIMIT() macro * to communicate the range of IDs that are valid for allocation. * Two common ranges are predefined for you: * * xa_limit_32b - [0 - UINT_MAX] * * xa_limit_31b - [0 - INT_MAX] */ struct xa_limit { u32 max; u32 min; }; #define XA_LIMIT(_min, _max) (struct xa_limit) { .min = _min, .max = _max } #define xa_limit_32b XA_LIMIT(0, UINT_MAX) #define xa_limit_31b XA_LIMIT(0, INT_MAX) typedef unsigned __bitwise xa_mark_t; #define XA_MARK_0 ((__force xa_mark_t)0U) #define XA_MARK_1 ((__force xa_mark_t)1U) #define XA_MARK_2 ((__force xa_mark_t)2U) #define XA_PRESENT ((__force xa_mark_t)8U) #define XA_MARK_MAX XA_MARK_2 #define XA_FREE_MARK XA_MARK_0 enum xa_lock_type { XA_LOCK_IRQ = 1, XA_LOCK_BH = 2, }; /* * Values for xa_flags. The radix tree stores its GFP flags in the xa_flags, * and we remain compatible with that. */ #define XA_FLAGS_LOCK_IRQ ((__force gfp_t)XA_LOCK_IRQ) #define XA_FLAGS_LOCK_BH ((__force gfp_t)XA_LOCK_BH) #define XA_FLAGS_TRACK_FREE ((__force gfp_t)4U) #define XA_FLAGS_ZERO_BUSY ((__force gfp_t)8U) #define XA_FLAGS_ALLOC_WRAPPED ((__force gfp_t)16U) #define XA_FLAGS_ACCOUNT ((__force gfp_t)32U) #define XA_FLAGS_MARK(mark) ((__force gfp_t)((1U << __GFP_BITS_SHIFT) << \ (__force unsigned)(mark))) /* ALLOC is for a normal 0-based alloc. ALLOC1 is for an 1-based alloc */ #define XA_FLAGS_ALLOC (XA_FLAGS_TRACK_FREE | XA_FLAGS_MARK(XA_FREE_MARK)) #define XA_FLAGS_ALLOC1 (XA_FLAGS_TRACK_FREE | XA_FLAGS_ZERO_BUSY) /** * struct xarray - The anchor of the XArray. * @xa_lock: Lock that protects the contents of the XArray. * * To use the xarray, define it statically or embed it in your data structure. * It is a very small data structure, so it does not usually make sense to * allocate it separately and keep a pointer to it in your data structure. * * You may use the xa_lock to protect your own data structures as well. */ /* * If all of the entries in the array are NULL, @xa_head is a NULL pointer. * If the only non-NULL entry in the array is at index 0, @xa_head is that * entry. If any other entry in the array is non-NULL, @xa_head points * to an @xa_node. */ struct xarray { spinlock_t xa_lock; /* private: The rest of the data structure is not to be used directly. */ gfp_t xa_flags; void __rcu * xa_head; }; #define XARRAY_INIT(name, flags) { \ .xa_lock = __SPIN_LOCK_UNLOCKED(name.xa_lock), \ .xa_flags = flags, \ .xa_head = NULL, \ } /** * DEFINE_XARRAY_FLAGS() - Define an XArray with custom flags. * @name: A string that names your XArray. * @flags: XA_FLAG values. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name and flags. It is * equivalent to calling xa_init_flags() on the array, but it does the * initialisation at compiletime instead of runtime. */ #define DEFINE_XARRAY_FLAGS(name, flags) \ struct xarray name = XARRAY_INIT(name, flags) /** * DEFINE_XARRAY() - Define an XArray. * @name: A string that names your XArray. * * This is intended for file scope definitions of XArrays. It declares * and initialises an empty XArray with the chosen name. It is equivalent * to calling xa_init() on the array, but it does the initialisation at * compiletime instead of runtime. */ #define DEFINE_XARRAY(name) DEFINE_XARRAY_FLAGS(name, 0) /** * DEFINE_XARRAY_ALLOC() - Define an XArray which allocates IDs starting at 0. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC) /** * DEFINE_XARRAY_ALLOC1() - Define an XArray which allocates IDs starting at 1. * @name: A string that names your XArray. * * This is intended for file scope definitions of allocating XArrays. * See also DEFINE_XARRAY(). */ #define DEFINE_XARRAY_ALLOC1(name) DEFINE_XARRAY_FLAGS(name, XA_FLAGS_ALLOC1) void *xa_load(struct xarray *, unsigned long index); void *xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *xa_erase(struct xarray *, unsigned long index); void *xa_store_range(struct xarray *, unsigned long first, unsigned long last, void *entry, gfp_t); bool xa_get_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); void *xa_find(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); void *xa_find_after(struct xarray *xa, unsigned long *index, unsigned long max, xa_mark_t) __attribute__((nonnull(2))); unsigned int xa_extract(struct xarray *, void **dst, unsigned long start, unsigned long max, unsigned int n, xa_mark_t); void xa_destroy(struct xarray *); /** * xa_init_flags() - Initialise an empty XArray with flags. * @xa: XArray. * @flags: XA_FLAG values. * * If you need to initialise an XArray with special flags (eg you need * to take the lock from interrupt context), use this function instead * of xa_init(). * * Context: Any context. */ static inline void xa_init_flags(struct xarray *xa, gfp_t flags) { spin_lock_init(&xa->xa_lock); xa->xa_flags = flags; xa->xa_head = NULL; } /** * xa_init() - Initialise an empty XArray. * @xa: XArray. * * An empty XArray is full of NULL entries. * * Context: Any context. */ static inline void xa_init(struct xarray *xa) { xa_init_flags(xa, 0); } /** * xa_empty() - Determine if an array has any present entries. * @xa: XArray. * * Context: Any context. * Return: %true if the array contains only NULL pointers. */ static inline bool xa_empty(const struct xarray *xa) { return xa->xa_head == NULL; } /** * xa_marked() - Inquire whether any entry in this array has a mark set * @xa: Array * @mark: Mark value * * Context: Any context. * Return: %true if any entry has this mark set. */ static inline bool xa_marked(const struct xarray *xa, xa_mark_t mark) { return xa->xa_flags & XA_FLAGS_MARK(mark); } /** * xa_for_each_range() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * @last: Last index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_range() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_range() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_range(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_range(xa, index, entry, start, last) \ for (index = start, \ entry = xa_find(xa, &index, last, XA_PRESENT); \ entry; \ entry = xa_find_after(xa, &index, last, XA_PRESENT)) /** * xa_for_each_start() - Iterate over a portion of an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @start: First index to retrieve from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you * want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set * to NULL and @index will have a value less than or equal to max. * * xa_for_each_start() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). * xa_for_each_start() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each() iterator instead. * The xas_for_each() iterator will expand into more inline code than * xa_for_each_start(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_start(xa, index, entry, start) \ xa_for_each_range(xa, index, entry, start, ULONG_MAX) /** * xa_for_each() - Iterate over present entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. You may modify @index during the iteration if you want * to skip or reprocess indices. It is safe to modify the array during the * iteration. At the end of the iteration, @entry will be set to NULL and * @index will have a value less than or equal to max. * * xa_for_each() is O(n.log(n)) while xas_for_each() is O(n). You have * to handle your own locking with xas_for_each(), and if you have to unlock * after each iteration, it will also end up being O(n.log(n)). xa_for_each() * will spin if it hits a retry entry; if you intend to see retry entries, * you should use the xas_for_each() iterator instead. The xas_for_each() * iterator will expand into more inline code than xa_for_each(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each(xa, index, entry) \ xa_for_each_start(xa, index, entry, 0) /** * xa_for_each_marked() - Iterate over marked entries in an XArray. * @xa: XArray. * @index: Index of @entry. * @entry: Entry retrieved from array. * @filter: Selection criterion. * * During the iteration, @entry will have the value of the entry stored * in @xa at @index. The iteration will skip all entries in the array * which do not match @filter. You may modify @index during the iteration * if you want to skip or reprocess indices. It is safe to modify the array * during the iteration. At the end of the iteration, @entry will be set to * NULL and @index will have a value less than or equal to max. * * xa_for_each_marked() is O(n.log(n)) while xas_for_each_marked() is O(n). * You have to handle your own locking with xas_for_each(), and if you have * to unlock after each iteration, it will also end up being O(n.log(n)). * xa_for_each_marked() will spin if it hits a retry entry; if you intend to * see retry entries, you should use the xas_for_each_marked() iterator * instead. The xas_for_each_marked() iterator will expand into more inline * code than xa_for_each_marked(). * * Context: Any context. Takes and releases the RCU lock. */ #define xa_for_each_marked(xa, index, entry, filter) \ for (index = 0, entry = xa_find(xa, &index, ULONG_MAX, filter); \ entry; entry = xa_find_after(xa, &index, ULONG_MAX, filter)) #define xa_trylock(xa) spin_trylock(&(xa)->xa_lock) #define xa_lock(xa) spin_lock(&(xa)->xa_lock) #define xa_unlock(xa) spin_unlock(&(xa)->xa_lock) #define xa_lock_bh(xa) spin_lock_bh(&(xa)->xa_lock) #define xa_unlock_bh(xa) spin_unlock_bh(&(xa)->xa_lock) #define xa_lock_irq(xa) spin_lock_irq(&(xa)->xa_lock) #define xa_unlock_irq(xa) spin_unlock_irq(&(xa)->xa_lock) #define xa_lock_irqsave(xa, flags) \ spin_lock_irqsave(&(xa)->xa_lock, flags) #define xa_unlock_irqrestore(xa, flags) \ spin_unlock_irqrestore(&(xa)->xa_lock, flags) #define xa_lock_nested(xa, subclass) \ spin_lock_nested(&(xa)->xa_lock, subclass) #define xa_lock_bh_nested(xa, subclass) \ spin_lock_bh_nested(&(xa)->xa_lock, subclass) #define xa_lock_irq_nested(xa, subclass) \ spin_lock_irq_nested(&(xa)->xa_lock, subclass) #define xa_lock_irqsave_nested(xa, flags, subclass) \ spin_lock_irqsave_nested(&(xa)->xa_lock, flags, subclass) /* * Versions of the normal API which require the caller to hold the * xa_lock. If the GFP flags allow it, they will drop the lock to * allocate memory, then reacquire it afterwards. These functions * may also re-enable interrupts if the XArray flags indicate the * locking should be interrupt safe. */ void *__xa_erase(struct xarray *, unsigned long index); void *__xa_store(struct xarray *, unsigned long index, void *entry, gfp_t); void *__xa_cmpxchg(struct xarray *, unsigned long index, void *old, void *entry, gfp_t); int __must_check __xa_insert(struct xarray *, unsigned long index, void *entry, gfp_t); int __must_check __xa_alloc(struct xarray *, u32 *id, void *entry, struct xa_limit, gfp_t); int __must_check __xa_alloc_cyclic(struct xarray *, u32 *id, void *entry, struct xa_limit, u32 *next, gfp_t); void __xa_set_mark(struct xarray *, unsigned long index, xa_mark_t); void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); /** * xa_store_bh() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock_bh(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_store_irq() - Store this entry in the XArray. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * This function is like calling xa_store() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { void *curr; xa_lock_irq(xa); curr = __xa_store(xa, index, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_erase_bh() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: The entry which used to be at this index. */ static inline void *xa_erase_bh(struct xarray *xa, unsigned long index) { void *entry; xa_lock_bh(xa); entry = __xa_erase(xa, index); xa_unlock_bh(xa); return entry; } /** * xa_erase_irq() - Erase this entry from the XArray. * @xa: XArray. * @index: Index of entry. * * After this function returns, loading from @index will return %NULL. * If the index is part of a multi-index entry, all indices will be erased * and none of the entries will be part of a multi-index entry. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: The entry which used to be at this index. */ static inline void *xa_erase_irq(struct xarray *xa, unsigned long index) { void *entry; xa_lock_irq(xa); entry = __xa_erase(xa, index); xa_unlock_irq(xa); return entry; } /** * xa_cmpxchg() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * If the entry at @index is the same as @old, replace it with @entry. * If the return value is equal to @old, then the exchange was successful. * * Context: Any context. Takes and releases the xa_lock. May sleep * if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; xa_lock(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock(xa); return curr; } /** * xa_cmpxchg_bh() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables softirqs * while holding the array lock. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_bh(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; xa_lock_bh(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_bh(xa); return curr; } /** * xa_cmpxchg_irq() - Conditionally replace an entry in the XArray. * @xa: XArray. * @index: Index into array. * @old: Old value to test against. * @entry: New value to place in array. * @gfp: Memory allocation flags. * * This function is like calling xa_cmpxchg() except it disables interrupts * while holding the array lock. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: The old value at this index or xa_err() if an error happened. */ static inline void *xa_cmpxchg_irq(struct xarray *xa, unsigned long index, void *old, void *entry, gfp_t gfp) { void *curr; xa_lock_irq(xa); curr = __xa_cmpxchg(xa, index, old, entry, gfp); xa_unlock_irq(xa); return curr; } /** * xa_insert() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; xa_lock(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock(xa); return err; } /** * xa_insert_bh() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; xa_lock_bh(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_bh(xa); return err; } /** * xa_insert_irq() - Store this entry in the XArray unless another entry is * already present. * @xa: XArray. * @index: Index into array. * @entry: New entry. * @gfp: Memory allocation flags. * * Inserting a NULL entry will store a reserved entry (like xa_reserve()) * if no entry is present. Inserting will fail if a reserved entry is * present, even though loading from this index will return NULL. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the store succeeded. -EBUSY if another entry was present. * -ENOMEM if memory could not be allocated. */ static inline int __must_check xa_insert_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) { int err; xa_lock_irq(xa); err = __xa_insert(xa, index, entry, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline __must_check int xa_alloc(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; xa_lock(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock(xa); return err; } /** * xa_alloc_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; xa_lock_bh(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 on success, -ENOMEM if memory could not be allocated or * -EBUSY if there are no free entries in @limit. */ static inline int __must_check xa_alloc_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, gfp_t gfp) { int err; xa_lock_irq(xa); err = __xa_alloc(xa, id, entry, limit, gfp); xa_unlock_irq(xa); return err; } /** * xa_alloc_cyclic() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Context: Any context. Takes and releases the xa_lock. May sleep if * the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; xa_lock(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock(xa); return err; } /** * xa_alloc_cyclic_bh() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_bh(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; xa_lock_bh(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_bh(xa); return err; } /** * xa_alloc_cyclic_irq() - Find somewhere to store this entry in the XArray. * @xa: XArray. * @id: Pointer to ID. * @entry: New entry. * @limit: Range of allocated ID. * @next: Pointer to next ID to allocate. * @gfp: Memory allocation flags. * * Finds an empty entry in @xa between @limit.min and @limit.max, * stores the index into the @id pointer, then stores the entry at * that index. A concurrent lookup will not see an uninitialised @id. * The search for an empty entry will start at @next and will wrap * around if necessary. * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. May sleep if the @gfp flags permit. * Return: 0 if the allocation succeeded without wrapping. 1 if the * allocation succeeded after wrapping, -ENOMEM if memory could not be * allocated or -EBUSY if there are no free entries in @limit. */ static inline int xa_alloc_cyclic_irq(struct xarray *xa, u32 *id, void *entry, struct xa_limit limit, u32 *next, gfp_t gfp) { int err; xa_lock_irq(xa); err = __xa_alloc_cyclic(xa, id, entry, limit, next, gfp); xa_unlock_irq(xa); return err; } /** * xa_reserve() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * Ensures there is somewhere to store an entry at @index in the array. * If there is already something stored at @index, this function does * nothing. If there was nothing there, the entry is marked as reserved. * Loading from a reserved entry returns a %NULL pointer. * * If you do not use the entry that you have reserved, call xa_release() * or xa_erase() to free any unnecessary memory. * * Context: Any context. Takes and releases the xa_lock. * May sleep if the @gfp flags permit. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_bh() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * A softirq-disabling version of xa_reserve(). * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_bh(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_bh(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_reserve_irq() - Reserve this index in the XArray. * @xa: XArray. * @index: Index into array. * @gfp: Memory allocation flags. * * An interrupt-disabling version of xa_reserve(). * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. * Return: 0 if the reservation succeeded or -ENOMEM if it failed. */ static inline __must_check int xa_reserve_irq(struct xarray *xa, unsigned long index, gfp_t gfp) { return xa_err(xa_cmpxchg_irq(xa, index, NULL, XA_ZERO_ENTRY, gfp)); } /** * xa_release() - Release a reserved entry. * @xa: XArray. * @index: Index of entry. * * After calling xa_reserve(), you can call this function to release the * reservation. If the entry at @index has been stored to, this function * will do nothing. */ static inline void xa_release(struct xarray *xa, unsigned long index) { xa_cmpxchg(xa, index, XA_ZERO_ENTRY, NULL, 0); } /* Everything below here is the Advanced API. Proceed with caution. */ /* * The xarray is constructed out of a set of 'chunks' of pointers. Choosing * the best chunk size requires some tradeoffs. A power of two recommends * itself so that we can walk the tree based purely on shifts and masks. * Generally, the larger the better; as the number of slots per level of the * tree increases, the less tall the tree needs to be. But that needs to be * balanced against the memory consumption of each node. On a 64-bit system, * xa_node is currently 576 bytes, and we get 7 of them per 4kB page. If we * doubled the number of slots per node, we'd get only 3 nodes per 4kB page. */ #ifndef XA_CHUNK_SHIFT #define XA_CHUNK_SHIFT (CONFIG_BASE_SMALL ? 4 : 6) #endif #define XA_CHUNK_SIZE (1UL << XA_CHUNK_SHIFT) #define XA_CHUNK_MASK (XA_CHUNK_SIZE - 1) #define XA_MAX_MARKS 3 #define XA_MARK_LONGS DIV_ROUND_UP(XA_CHUNK_SIZE, BITS_PER_LONG) /* * @count is the count of every non-NULL element in the ->slots array * whether that is a value entry, a retry entry, a user pointer, * a sibling entry or a pointer to the next level of the tree. * @nr_values is the count of every element in ->slots which is * either a value entry or a sibling of a value entry. */ struct xa_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char nr_values; /* Value entry count */ struct xa_node __rcu *parent; /* NULL at top of tree */ struct xarray *array; /* The array we belong to */ union { struct list_head private_list; /* For tree user */ struct rcu_head rcu_head; /* Used when freeing node */ }; void __rcu *slots[XA_CHUNK_SIZE]; union { unsigned long tags[XA_MAX_MARKS][XA_MARK_LONGS]; unsigned long marks[XA_MAX_MARKS][XA_MARK_LONGS]; }; }; void xa_dump(const struct xarray *); void xa_dump_node(const struct xa_node *); #ifdef XA_DEBUG #define XA_BUG_ON(xa, x) do { \ if (x) { \ xa_dump(xa); \ BUG(); \ } \ } while (0) #define XA_NODE_BUG_ON(node, x) do { \ if (x) { \ if (node) xa_dump_node(node); \ BUG(); \ } \ } while (0) #else #define XA_BUG_ON(xa, x) do { } while (0) #define XA_NODE_BUG_ON(node, x) do { } while (0) #endif /* Private */ static inline void *xa_head(const struct xarray *xa) { return rcu_dereference_check(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_head_locked(const struct xarray *xa) { return rcu_dereference_protected(xa->xa_head, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_check(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_entry_locked(const struct xarray *xa, const struct xa_node *node, unsigned int offset) { XA_NODE_BUG_ON(node, offset >= XA_CHUNK_SIZE); return rcu_dereference_protected(node->slots[offset], lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_check(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline struct xa_node *xa_parent_locked(const struct xarray *xa, const struct xa_node *node) { return rcu_dereference_protected(node->parent, lockdep_is_held(&xa->xa_lock)); } /* Private */ static inline void *xa_mk_node(const struct xa_node *node) { return (void *)((unsigned long)node | 2); } /* Private */ static inline struct xa_node *xa_to_node(const void *entry) { return (struct xa_node *)((unsigned long)entry - 2); } /* Private */ static inline bool xa_is_node(const void *entry) { return xa_is_internal(entry) && (unsigned long)entry > 4096; } /* Private */ static inline void *xa_mk_sibling(unsigned int offset) { return xa_mk_internal(offset); } /* Private */ static inline unsigned long xa_to_sibling(const void *entry) { return xa_to_internal(entry); } /** * xa_is_sibling() - Is the entry a sibling entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a sibling entry. */ static inline bool xa_is_sibling(const void *entry) { return IS_ENABLED(CONFIG_XARRAY_MULTI) && xa_is_internal(entry) && (entry < xa_mk_sibling(XA_CHUNK_SIZE - 1)); } #define XA_RETRY_ENTRY xa_mk_internal(256) /** * xa_is_retry() - Is the entry a retry entry? * @entry: Entry retrieved from the XArray * * Return: %true if the entry is a retry entry. */ static inline bool xa_is_retry(const void *entry) { return unlikely(entry == XA_RETRY_ENTRY); } /** * xa_is_advanced() - Is the entry only permitted for the advanced API? * @entry: Entry to be stored in the XArray. * * Return: %true if the entry cannot be stored by the normal API. */ static inline bool xa_is_advanced(const void *entry) { return xa_is_internal(entry) && (entry <= XA_RETRY_ENTRY); } /** * typedef xa_update_node_t - A callback function from the XArray. * @node: The node which is being processed * * This function is called every time the XArray updates the count of * present and value entries in a node. It allows advanced users to * maintain the private_list in the node. * * Context: The xa_lock is held and interrupts may be disabled. * Implementations should not drop the xa_lock, nor re-enable * interrupts. */ typedef void (*xa_update_node_t)(struct xa_node *node); void xa_delete_node(struct xa_node *, xa_update_node_t); /* * The xa_state is opaque to its users. It contains various different pieces * of state involved in the current operation on the XArray. It should be * declared on the stack and passed between the various internal routines. * The various elements in it should not be accessed directly, but only * through the provided accessor functions. The below documentation is for * the benefit of those working on the code, not for users of the XArray. * * @xa_node usually points to the xa_node containing the slot we're operating * on (and @xa_offset is the offset in the slots array). If there is a * single entry in the array at index 0, there are no allocated xa_nodes to * point to, and so we store %NULL in @xa_node. @xa_node is set to * the value %XAS_RESTART if the xa_state is not walked to the correct * position in the tree of nodes for this operation. If an error occurs * during an operation, it is set to an %XAS_ERROR value. If we run off the * end of the allocated nodes, it is set to %XAS_BOUNDS. */ struct xa_state { struct xarray *xa; unsigned long xa_index; unsigned char xa_shift; unsigned char xa_sibs; unsigned char xa_offset; unsigned char xa_pad; /* Helps gcc generate better code */ struct xa_node *xa_node; struct xa_node *xa_alloc; xa_update_node_t xa_update; }; /* * We encode errnos in the xas->xa_node. If an error has happened, we need to * drop the lock to fix it, and once we've done so the xa_state is invalid. */ #define XA_ERROR(errno) ((struct xa_node *)(((unsigned long)errno << 2) | 2UL)) #define XAS_BOUNDS ((struct xa_node *)1UL) #define XAS_RESTART ((struct xa_node *)3UL) #define __XA_STATE(array, index, shift, sibs) { \ .xa = array, \ .xa_index = index, \ .xa_shift = shift, \ .xa_sibs = sibs, \ .xa_offset = 0, \ .xa_pad = 0, \ .xa_node = XAS_RESTART, \ .xa_alloc = NULL, \ .xa_update = NULL \ } /** * XA_STATE() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * * Declare and initialise an xa_state on the stack. */ #define XA_STATE(name, array, index) \ struct xa_state name = __XA_STATE(array, index, 0, 0) /** * XA_STATE_ORDER() - Declare an XArray operation state. * @name: Name of this operation state (usually xas). * @array: Array to operate on. * @index: Initial index of interest. * @order: Order of entry. * * Declare and initialise an xa_state on the stack. This variant of * XA_STATE() allows you to specify the 'order' of the element you * want to operate on.` */ #define XA_STATE_ORDER(name, array, index, order) \ struct xa_state name = __XA_STATE(array, \ (index >> order) << order, \ order - (order % XA_CHUNK_SHIFT), \ (1U << (order % XA_CHUNK_SHIFT)) - 1) #define xas_marked(xas, mark) xa_marked((xas)->xa, (mark)) #define xas_trylock(xas) xa_trylock((xas)->xa) #define xas_lock(xas) xa_lock((xas)->xa) #define xas_unlock(xas) xa_unlock((xas)->xa) #define xas_lock_bh(xas) xa_lock_bh((xas)->xa) #define xas_unlock_bh(xas) xa_unlock_bh((xas)->xa) #define xas_lock_irq(xas) xa_lock_irq((xas)->xa) #define xas_unlock_irq(xas) xa_unlock_irq((xas)->xa) #define xas_lock_irqsave(xas, flags) \ xa_lock_irqsave((xas)->xa, flags) #define xas_unlock_irqrestore(xas, flags) \ xa_unlock_irqrestore((xas)->xa, flags) /** * xas_error() - Return an errno stored in the xa_state. * @xas: XArray operation state. * * Return: 0 if no error has been noted. A negative errno if one has. */ static inline int xas_error(const struct xa_state *xas) { return xa_err(xas->xa_node); } /** * xas_set_err() - Note an error in the xa_state. * @xas: XArray operation state. * @err: Negative error number. * * Only call this function with a negative @err; zero or positive errors * will probably not behave the way you think they should. If you want * to clear the error from an xa_state, use xas_reset(). */ static inline void xas_set_err(struct xa_state *xas, long err) { xas->xa_node = XA_ERROR(err); } /** * xas_invalid() - Is the xas in a retry or error state? * @xas: XArray operation state. * * Return: %true if the xas cannot be used for operations. */ static inline bool xas_invalid(const struct xa_state *xas) { return (unsigned long)xas->xa_node & 3; } /** * xas_valid() - Is the xas a valid cursor into the array? * @xas: XArray operation state. * * Return: %true if the xas can be used for operations. */ static inline bool xas_valid(const struct xa_state *xas) { return !xas_invalid(xas); } /** * xas_is_node() - Does the xas point to a node? * @xas: XArray operation state. * * Return: %true if the xas currently references a node. */ static inline bool xas_is_node(const struct xa_state *xas) { return xas_valid(xas) && xas->xa_node; } /* True if the pointer is something other than a node */ static inline bool xas_not_node(struct xa_node *node) { return ((unsigned long)node & 3) || !node; } /* True if the node represents RESTART or an error */ static inline bool xas_frozen(struct xa_node *node) { return (unsigned long)node & 2; } /* True if the node represents head-of-tree, RESTART or BOUNDS */ static inline bool xas_top(struct xa_node *node) { return node <= XAS_RESTART; } /** * xas_reset() - Reset an XArray operation state. * @xas: XArray operation state. * * Resets the error or walk state of the @xas so future walks of the * array will start from the root. Use this if you have dropped the * xarray lock and want to reuse the xa_state. * * Context: Any context. */ static inline void xas_reset(struct xa_state *xas) { xas->xa_node = XAS_RESTART; } /** * xas_retry() - Retry the operation if appropriate. * @xas: XArray operation state. * @entry: Entry from xarray. * * The advanced functions may sometimes return an internal entry, such as * a retry entry or a zero entry. This function sets up the @xas to restart * the walk from the head of the array if needed. * * Context: Any context. * Return: true if the operation needs to be retried. */ static inline bool xas_retry(struct xa_state *xas, const void *entry) { if (xa_is_zero(entry)) return true; if (!xa_is_retry(entry)) return false; xas_reset(xas); return true; } void *xas_load(struct xa_state *); void *xas_store(struct xa_state *, void *entry); void *xas_find(struct xa_state *, unsigned long max); void *xas_find_conflict(struct xa_state *); bool xas_get_mark(const struct xa_state *, xa_mark_t); void xas_set_mark(const struct xa_state *, xa_mark_t); void xas_clear_mark(const struct xa_state *, xa_mark_t); void *xas_find_marked(struct xa_state *, unsigned long max, xa_mark_t); void xas_init_marks(const struct xa_state *); bool xas_nomem(struct xa_state *, gfp_t); void xas_pause(struct xa_state *); void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else static inline int xa_get_order(struct xarray *xa, unsigned long index) { return 0; } static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { xas_store(xas, entry); } static inline void xas_split_alloc(struct xa_state *xas, void *entry, unsigned int order, gfp_t gfp) { } #endif /** * xas_reload() - Refetch an entry from the xarray. * @xas: XArray operation state. * * Use this function to check that a previously loaded entry still has * the same value. This is useful for the lockless pagecache lookup where * we walk the array with only the RCU lock to protect us, lock the page, * then check that the page hasn't moved since we looked it up. * * The caller guarantees that @xas is still valid. If it may be in an * error or restart state, call xas_load() instead. * * Return: The entry at this location in the xarray. */ static inline void *xas_reload(struct xa_state *xas) { struct xa_node *node = xas->xa_node; void *entry; char offset; if (!node) return xa_head(xas->xa); if (IS_ENABLED(CONFIG_XARRAY_MULTI)) { offset = (xas->xa_index >> node->shift) & XA_CHUNK_MASK; entry = xa_entry(xas->xa, node, offset); if (!xa_is_sibling(entry)) return entry; offset = xa_to_sibling(entry); } else { offset = xas->xa_offset; } return xa_entry(xas->xa, node, offset); } /** * xas_set() - Set up XArray operation state for a different index. * @xas: XArray operation state. * @index: New index into the XArray. * * Move the operation state to refer to a different index. This will * have the effect of starting a walk from the top; see xas_next() * to move to an adjacent index. */ static inline void xas_set(struct xa_state *xas, unsigned long index) { xas->xa_index = index; xas->xa_node = XAS_RESTART; } /** * xas_set_order() - Set up XArray operation state for a multislot entry. * @xas: XArray operation state. * @index: Target of the operation. * @order: Entry occupies 2^@order indices. */ static inline void xas_set_order(struct xa_state *xas, unsigned long index, unsigned int order) { #ifdef CONFIG_XARRAY_MULTI xas->xa_index = order < BITS_PER_LONG ? (index >> order) << order : 0; xas->xa_shift = order - (order % XA_CHUNK_SHIFT); xas->xa_sibs = (1 << (order % XA_CHUNK_SHIFT)) - 1; xas->xa_node = XAS_RESTART; #else BUG_ON(order > 0); xas_set(xas, index); #endif } /** * xas_set_update() - Set up XArray operation state for a callback. * @xas: XArray operation state. * @update: Function to call when updating a node. * * The XArray can notify a caller after it has updated an xa_node. * This is advanced functionality and is only needed by the page cache. */ static inline void xas_set_update(struct xa_state *xas, xa_update_node_t update) { xas->xa_update = update; } /** * xas_next_entry() - Advance iterator to next present entry. * @xas: XArray operation state. * @max: Highest index to return. * * xas_next_entry() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find(), and will call xas_find() * for all the hard cases. * * Return: The next present entry after the one currently referred to by @xas. */ static inline void *xas_next_entry(struct xa_state *xas, unsigned long max) { struct xa_node *node = xas->xa_node; void *entry; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset != (xas->xa_index & XA_CHUNK_MASK))) return xas_find(xas, max); do { if (unlikely(xas->xa_index >= max)) return xas_find(xas, max); if (unlikely(xas->xa_offset == XA_CHUNK_MASK)) return xas_find(xas, max); entry = xa_entry(xas->xa, node, xas->xa_offset + 1); if (unlikely(xa_is_internal(entry))) return xas_find(xas, max); xas->xa_offset++; xas->xa_index++; } while (!entry); return entry; } /* Private */ static inline unsigned int xas_find_chunk(struct xa_state *xas, bool advance, xa_mark_t mark) { unsigned long *addr = xas->xa_node->marks[(__force unsigned)mark]; unsigned int offset = xas->xa_offset; if (advance) offset++; if (XA_CHUNK_SIZE == BITS_PER_LONG) { if (offset < XA_CHUNK_SIZE) { unsigned long data = *addr & (~0UL << offset); if (data) return __ffs(data); } return XA_CHUNK_SIZE; } return find_next_bit(addr, XA_CHUNK_SIZE, offset); } /** * xas_next_marked() - Advance iterator to next marked entry. * @xas: XArray operation state. * @max: Highest index to return. * @mark: Mark to search for. * * xas_next_marked() is an inline function to optimise xarray traversal for * speed. It is equivalent to calling xas_find_marked(), and will call * xas_find_marked() for all the hard cases. * * Return: The next marked entry after the one currently referred to by @xas. */ static inline void *xas_next_marked(struct xa_state *xas, unsigned long max, xa_mark_t mark) { struct xa_node *node = xas->xa_node; void *entry; unsigned int offset; if (unlikely(xas_not_node(node) || node->shift)) return xas_find_marked(xas, max, mark); offset = xas_find_chunk(xas, true, mark); xas->xa_offset = offset; xas->xa_index = (xas->xa_index & ~XA_CHUNK_MASK) + offset; if (xas->xa_index > max) return NULL; if (offset == XA_CHUNK_SIZE) return xas_find_marked(xas, max, mark); entry = xa_entry(xas->xa, node, offset); if (!entry) return xas_find_marked(xas, max, mark); return entry; } /* * If iterating while holding a lock, drop the lock and reschedule * every %XA_CHECK_SCHED loops. */ enum { XA_CHECK_SCHED = 4096, }; /** * xas_for_each() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * * The loop body will be executed for each entry present in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each(xas, entry, max) \ for (entry = xas_find(xas, max); entry; \ entry = xas_next_entry(xas, max)) /** * xas_for_each_marked() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * @max: Maximum index to retrieve from array. * @mark: Mark to search for. * * The loop body will be executed for each marked entry in the xarray * between the current xas position and @max. @entry will be set to * the entry retrieved from the xarray. It is safe to delete entries * from the array in the loop body. You should hold either the RCU lock * or the xa_lock while iterating. If you need to drop the lock, call * xas_pause() first. */ #define xas_for_each_marked(xas, entry, max, mark) \ for (entry = xas_find_marked(xas, max, mark); entry; \ entry = xas_next_marked(xas, max, mark)) /** * xas_for_each_conflict() - Iterate over a range of an XArray. * @xas: XArray operation state. * @entry: Entry retrieved from the array. * * The loop body will be executed for each entry in the XArray that * lies within the range specified by @xas. If the loop terminates * normally, @entry will be %NULL. The user may break out of the loop, * which will leave @entry set to the conflicting entry. The caller * may also call xa_set_err() to exit the loop while setting an error * to record the reason. */ #define xas_for_each_conflict(xas, entry) \ while ((entry = xas_find_conflict(xas))) void *__xas_next(struct xa_state *); void *__xas_prev(struct xa_state *); /** * xas_prev() - Move iterator to previous index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * subtracted from the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index 0, this function wraps * around to %ULONG_MAX. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_prev(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == 0)) return __xas_prev(xas); xas->xa_index--; xas->xa_offset--; return xa_entry(xas->xa, node, xas->xa_offset); } /** * xas_next() - Move state to next index. * @xas: XArray operation state. * * If the @xas was in an error state, it will remain in an error state * and this function will return %NULL. If the @xas has never been walked, * it will have the effect of calling xas_load(). Otherwise one will be * added to the index and the state will be walked to the correct * location in the array for the next operation. * * If the iterator was referencing index %ULONG_MAX, this function wraps * around to 0. * * Return: The entry at the new index. This may be %NULL or an internal * entry. */ static inline void *xas_next(struct xa_state *xas) { struct xa_node *node = xas->xa_node; if (unlikely(xas_not_node(node) || node->shift || xas->xa_offset == XA_CHUNK_MASK)) return __xas_next(xas); xas->xa_index++; xas->xa_offset++; return xa_entry(xas->xa, node, xas->xa_offset); } #endif /* _LINUX_XARRAY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IPV6_H #define _NET_IPV6_H #include <linux/ipv6.h> #include <linux/hardirq.h> #include <linux/jhash.h> #include <linux/refcount.h> #include <linux/jump_label_ratelimit.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/snmp.h> #include <net/netns/hash.h> #define SIN6_LEN_RFC2133 24 #define IPV6_MAXPLEN 65535 /* * NextHeader field of IPv6 header */ #define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ #define NEXTHDR_TCP 6 /* TCP segment. */ #define NEXTHDR_UDP 17 /* UDP message. */ #define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ #define NEXTHDR_ROUTING 43 /* Routing header. */ #define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ #define NEXTHDR_GRE 47 /* GRE header. */ #define NEXTHDR_ESP 50 /* Encapsulating security payload. */ #define NEXTHDR_AUTH 51 /* Authentication header. */ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ #define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 /* Limits on Hop-by-Hop and Destination options. * * Per RFC8200 there is no limit on the maximum number or lengths of options in * Hop-by-Hop or Destination options other then the packet must fit in an MTU. * We allow configurable limits in order to mitigate potential denial of * service attacks. * * There are three limits that may be set: * - Limit the number of options in a Hop-by-Hop or Destination options * extension header * - Limit the byte length of a Hop-by-Hop or Destination options extension * header * - Disallow unknown options * * The limits are expressed in corresponding sysctls: * * ipv6.sysctl.max_dst_opts_cnt * ipv6.sysctl.max_hbh_opts_cnt * ipv6.sysctl.max_dst_opts_len * ipv6.sysctl.max_hbh_opts_len * * max_*_opts_cnt is the number of TLVs that are allowed for Destination * options or Hop-by-Hop options. If the number is less than zero then unknown * TLVs are disallowed and the number of known options that are allowed is the * absolute value. Setting the value to INT_MAX indicates no limit. * * max_*_opts_len is the length limit in bytes of a Destination or * Hop-by-Hop options extension header. Setting the value to INT_MAX * indicates no length limit. * * If a limit is exceeded when processing an extension header the packet is * silently discarded. */ /* Default limits for Hop-by-Hop and Destination options */ #define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 #define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ /* * Addr type * * type - unicast | multicast * scope - local | site | global * v4 - compat * v4mapped * any * loopback */ #define IPV6_ADDR_ANY 0x0000U #define IPV6_ADDR_UNICAST 0x0001U #define IPV6_ADDR_MULTICAST 0x0002U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U #define IPV6_ADDR_SITELOCAL 0x0040U #define IPV6_ADDR_COMPATv4 0x0080U #define IPV6_ADDR_SCOPE_MASK 0x00f0U #define IPV6_ADDR_MAPPED 0x1000U /* * Addr scopes */ #define IPV6_ADDR_MC_SCOPE(a) \ ((a)->s6_addr[1] & 0x0f) /* nonstandard */ #define __IPV6_ADDR_SCOPE_INVALID -1 #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 #define IPV6_ADDR_SCOPE_GLOBAL 0x0e /* * Addr flags */ #define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ ((a)->s6_addr[1] & 0x10) #define IPV6_ADDR_MC_FLAG_PREFIX(a) \ ((a)->s6_addr[1] & 0x20) #define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ ((a)->s6_addr[1] & 0x40) /* * fragmentation header */ struct frag_hdr { __u8 nexthdr; __u8 reserved; __be16 frag_off; __be32 identification; }; #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 struct ip6_fraglist_iter { struct ipv6hdr *tmp_hdr; struct sk_buff *frag; int offset; unsigned int hlen; __be32 frag_id; u8 nexthdr; }; int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter); static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip6_frag_state { u8 *prevhdr; unsigned int hlen; unsigned int mtu; unsigned int left; int offset; int ptr; int hroom; int troom; __be32 frag_id; u8 nexthdr; }; void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state); #define IP6_REPLY_MARK(net, mark) \ ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0) #include <net/sock.h> /* sysctls */ extern int sysctl_mld_max_msf; extern int sysctl_mld_qrv; #define _DEVINC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\ mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\ }) /* per device counters are atomic_long_t */ #define _DEVINCATOMIC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\ }) /* per device and per net counters are atomic_long_t */ #define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\ }) #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ }) /* MIBs */ #define IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, , idev, field) #define __IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, __, idev, field) #define IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, , idev, field, val) #define __IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, __, idev, field, val) #define IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, , idev, field, val) #define __IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, __, idev, field, val) #define ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, , idev, field) #define __ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, __, idev, field) #define ICMP6MSGOUT_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field +256) #define ICMP6MSGIN_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field) struct ip6_ra_chain { struct ip6_ra_chain *next; struct sock *sk; int sel; void (*destructor)(struct sock *); }; extern struct ip6_ra_chain *ip6_ra_chain; extern rwlock_t ip6_ra_lock; /* This structure is prepared by protocol, when parsing ancillary data and passed to IPv6. */ struct ipv6_txoptions { refcount_t refcnt; /* Length of this structure */ int tot_len; /* length of extension headers */ __u16 opt_flen; /* after fragment hdr */ __u16 opt_nflen; /* before fragment hdr */ struct ipv6_opt_hdr *hopopt; struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; /* flowlabel_reflect sysctl values */ enum flowlabel_reflect { FLOWLABEL_REFLECT_ESTABLISHED = 1, FLOWLABEL_REFLECT_TCP_RESET = 2, FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES = 4, }; struct ip6_flowlabel { struct ip6_flowlabel __rcu *next; __be32 label; atomic_t users; struct in6_addr dst; struct ipv6_txoptions *opt; unsigned long linger; struct rcu_head rcu; u8 share; union { struct pid *pid; kuid_t uid; } owner; unsigned long lastuse; unsigned long expires; struct net *fl_net; }; #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) #define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) #define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000) #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) #define IPV6_TCLASS_SHIFT 20 struct ipv6_fl_socklist { struct ipv6_fl_socklist __rcu *next; struct ip6_flowlabel *fl; struct rcu_head rcu; }; struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __s8 dontfrag; struct ipv6_txoptions *opt; __u16 gso_size; }; static inline void ipcm6_init(struct ipcm6_cookie *ipc6) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = -1, .dontfrag = -1, }; } static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct ipv6_pinfo *np) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = np->tclass, .dontfrag = np->dontfrag, }; } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { if (!refcount_inc_not_zero(&opt->refcnt)) opt = NULL; else opt = rcu_pointer_handoff(opt); } rcu_read_unlock(); return opt; } static inline void txopt_put(struct ipv6_txoptions *opt) { if (opt && refcount_dec_and_test(&opt->refcnt)) kfree_rcu(opt, rcu); } struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt); void fl6_free_socklist(struct sock *sk); int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen); int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { if (fl) atomic_dec(&fl->users); } void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); int ipv6_parse_hopopts(struct sk_buff *skb); struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt); struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 : idev->cnf.accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ int __ipv6_addr_type(const struct in6_addr *addr); static inline int ipv6_addr_type(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & 0xffff; } static inline int ipv6_addr_scope(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; } static inline int __ipv6_addr_src_scope(int type) { return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); } static inline int ipv6_addr_src_scope(const struct in6_addr *addr) { return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline bool __ipv6_addr_needs_scope_id(int type) { return type & IPV6_ADDR_LINKLOCAL || (type & IPV6_ADDR_MULTICAST && (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); } static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) { return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? iface : 0; } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { return memcmp(a1, a2, sizeof(struct in6_addr)); } static inline bool ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ulm = (const unsigned long *)m; const unsigned long *ul2 = (const unsigned long *)a2; return !!(((ul1[0] ^ ul2[0]) & ulm[0]) | ((ul1[1] ^ ul2[1]) & ulm[1])); #else return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); #endif } static inline void ipv6_addr_prefix(struct in6_addr *pfx, const struct in6_addr *addr, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); memcpy(pfx->s6_addr, addr, o); if (b != 0) pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, const struct in6_addr *pfx, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memcpy(addr->s6_addr, pfx, o); if (b != 0) { addr->s6_addr[o] &= ~(0xff00 >> b); addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); } } static inline void __ipv6_addr_set_half(__be32 *addr, __be32 wh, __be32 wl) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #if defined(__BIG_ENDIAN) if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) { *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl)); return; } #elif defined(__LITTLE_ENDIAN) if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) { *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh)); return; } #endif #endif addr[0] = wh; addr[1] = wl; } static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2); __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4); } static inline bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; #endif } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, const __be64 *a2, unsigned int len) { if (len && ((*a1 ^ *a2) & cpu_to_be64((~0UL) << (64 - len)))) return false; return true; } static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be64 *a1 = (const __be64 *)addr1; const __be64 *a2 = (const __be64 *)addr2; if (prefixlen >= 64) { if (a1[0] ^ a2[0]) return false; return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64); } return __ipv6_prefix_equal64_half(a1, a2, prefixlen); } #else static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be32 *a1 = addr1->s6_addr32; const __be32 *a2 = addr2->s6_addr32; unsigned int pdw, pbi; /* check complete u32 in prefix */ pdw = prefixlen >> 5; if (pdw && memcmp(a1, a2, pdw << 2)) return false; /* check incomplete u32 in prefix */ pbi = prefixlen & 0x1f; if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) return false; return true; } #endif static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; return (ul[0] | ul[1]) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]) == 0; #endif } static inline u32 ipv6_addr_hash(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; unsigned long x = ul[0] ^ ul[1]; return (u32)(x ^ (x >> 32)); #else return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ a->s6_addr32[2] ^ a->s6_addr32[3]); #endif } /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; return jhash_3words(v, (__force u32)a->s6_addr32[2], (__force u32)a->s6_addr32[3], initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *be = (const __be64 *)a; return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0; #endif } /* * Note that we must __force cast these to unsigned long to make sparse happy, * since all of the endian-annotated types are fixed size regardless of arch. */ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { return ( #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 *(unsigned long *)a | #else (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | #endif (__force unsigned long)(a->s6_addr32[2] ^ cpu_to_be32(0x0000ffff))) == 0UL; } static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); } static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) { unsigned int hash, mix = net_hash_mix(net); if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) */ static inline bool ipv6_addr_orchid(const struct in6_addr *a) { return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); } static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) { return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); } static inline void ipv6_addr_set_v4mapped(const __be32 addr, struct in6_addr *v4mapped) { ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr); } /* * find the first different bit between two addresses * length of address must be a multiple of 32bits */ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen) { const __be32 *a1 = token1, *a2 = token2; int i; addrlen >>= 2; for (i = 0; i < addrlen; i++) { __be32 xb = a1[i] ^ a2[i]; if (xb) return i * 32 + 31 - __fls(ntohl(xb)); } /* * we should *never* get to this point since that * would mean the addrs are equal * * However, we do get to it 8) And exacly, when * addresses are equal 8) * * ip route add 1111::/128 via ... * ip route add 1111::/64 via ... * and we are here. * * Ideally, this function should stop comparison * at prefix length. It does not, but it is still OK, * if returned value is greater than prefix length. * --ANK (980803) */ return addrlen << 5; } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen) { const __be64 *a1 = token1, *a2 = token2; int i; addrlen >>= 3; for (i = 0; i < addrlen; i++) { __be64 xb = a1[i] ^ a2[i]; if (xb) return i * 64 + 63 - __fls(be64_to_cpu(xb)); } return addrlen << 6; } #endif static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 if (__builtin_constant_p(addrlen) && !(addrlen & 7)) return __ipv6_addr_diff64(token1, token2, addrlen); #endif return __ipv6_addr_diff32(token1, token2, addrlen); } static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, struct dst_entry *dst) { int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) hlimit = np->mcast_hops; else hlimit = np->hop_limit; if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; } /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, const struct ipv6hdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } #if IS_ENABLED(CONFIG_IPV6) static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || inet->freebind || inet->transparent; } /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 #define IP6_AUTO_FLOW_LABEL_OPTIN 2 #define IP6_AUTO_FLOW_LABEL_FORCED 3 #define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED #define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { u32 hash; /* @flowlabel may include more than a flow label, eg, the traffic class. * Here we want only the flow label value. */ flowlabel &= IPV6_FLOWLABEL_MASK; if (flowlabel || net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || (!autolabel && net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) return flowlabel; hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; if (net->ipv6.sysctl.flowlabel_state_ranges) flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { switch (net->ipv6.sysctl.auto_flowlabels) { case IP6_AUTO_FLOW_LABEL_OFF: case IP6_AUTO_FLOW_LABEL_OPTIN: default: return 0; case IP6_AUTO_FLOW_LABEL_OPTOUT: case IP6_AUTO_FLOW_LABEL_FORCED: return 1; } } #else static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { return 0; } #endif #if IS_ENABLED(CONFIG_IPV6) static inline int ip6_multipath_hash_policy(const struct net *net) { return net->ipv6.sysctl.multipath_hash_policy; } #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } #endif /* * Header manipulation */ static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, __be32 flowlabel) { *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; } static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWINFO_MASK; } static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; } static inline u8 ip6_tclass(__be32 flowinfo) { return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) { return fl6->flowlabel & IPV6_FLOWLABEL_MASK; } /* * Prototypes exported by ipv6 */ /* * rcv function (called from netdevice level) */ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); /* * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); void ip6_flush_pending_frames(struct sock *sk); int ip6_send_skb(struct sk_buff *skb); struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork, &inet6_sk(sk)->cork); } int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, struct in6_addr *saddr, const struct ip_tunnel_info *info, u8 protocol, bool use_cache); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); /* * skb processing functions */ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); /* * Extension header (options) processing */ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr_p, struct in6_addr *saddr); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto); int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, __be16 *frag_offp); bool ipv6_ext_hdr(u8 nexthdr); enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), IP6_FH_F_SKIP_RH = (1 << 2), }; /* find specified header and get offset to it */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *fragflg); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig); /* * socket options (ipv6_sockglue.c) */ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); /* * reassembly.c */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage __user *p); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); void ac6_proc_exit(struct net *net); int raw6_proc_init(void); void raw6_proc_exit(void); int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); int udplite6_proc_init(void); void udplite6_proc_exit(void); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); #else static inline int ac6_proc_init(struct net *net) { return 0; } static inline void ac6_proc_exit(struct net *net) { } static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; } static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; } #endif #ifdef CONFIG_SYSCTL struct ctl_table *ipv6_icmp_sysctl_init(struct net *net); struct ctl_table *ipv6_route_sysctl_init(struct net *net); int ipv6_sysctl_register(void); void ipv6_sysctl_unregister(void); #endif int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); static inline int ip6_sock_set_v6only(struct sock *sk) { if (inet_sk(sk)->inet_num) return -EINVAL; lock_sock(sk); sk->sk_ipv6only = true; release_sock(sk); return 0; } static inline void ip6_sock_set_recverr(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->recverr = true; release_sock(sk); } static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) { unsigned int pref = 0; unsigned int prefmask = ~0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { case IPV6_PREFER_SRC_PUBLIC: pref |= IPV6_PREFER_SRC_PUBLIC; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_TMP: pref |= IPV6_PREFER_SRC_TMP; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_PUBTMP_DEFAULT: prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case 0: break; default: return -EINVAL; } /* check HOME/COA conflicts */ switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { case IPV6_PREFER_SRC_HOME: prefmask &= ~IPV6_PREFER_SRC_COA; break; case IPV6_PREFER_SRC_COA: pref |= IPV6_PREFER_SRC_COA; break; case 0: break; default: return -EINVAL; } /* check CGA/NONCGA conflicts */ switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { case IPV6_PREFER_SRC_CGA: case IPV6_PREFER_SRC_NONCGA: case 0: break; default: return -EINVAL; } inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; return 0; } static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) { int ret; lock_sock(sk); ret = __ip6_sock_set_addr_preferences(sk, val); release_sock(sk); return ret; } static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); } #endif /* _NET_IPV6_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_DCACHE_H #define __LINUX_DCACHE_H #include <linux/atomic.h> #include <linux/list.h> #include <linux/rculist.h> #include <linux/rculist_bl.h> #include <linux/spinlock.h> #include <linux/seqlock.h> #include <linux/cache.h> #include <linux/rcupdate.h> #include <linux/lockref.h> #include <linux/stringhash.h> #include <linux/wait.h> struct path; struct vfsmount; /* * linux/include/linux/dcache.h * * Dirent cache data structures * * (C) Copyright 1997 Thomas Schoebel-Theuer, * with heavy changes by Linus Torvalds */ #define IS_ROOT(x) ((x) == (x)->d_parent) /* The hash is always the low bits of hash_len */ #ifdef __LITTLE_ENDIAN #define HASH_LEN_DECLARE u32 hash; u32 len #define bytemask_from_count(cnt) (~(~0ul << (cnt)*8)) #else #define HASH_LEN_DECLARE u32 len; u32 hash #define bytemask_from_count(cnt) (~(~0ul >> (cnt)*8)) #endif /* * "quick string" -- eases parameter passing, but more importantly * saves "metadata" about the string (ie length and the hash). * * hash comes first so it snuggles against d_parent in the * dentry. */ struct qstr { union { struct { HASH_LEN_DECLARE; }; u64 hash_len; }; const unsigned char *name; }; #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } extern const struct qstr empty_name; extern const struct qstr slash_name; struct dentry_stat_t { long nr_dentry; long nr_unused; long age_limit; /* age in seconds */ long want_pages; /* pages requested by system */ long nr_negative; /* # of unused negative dentries */ long dummy; /* Reserved for future use */ }; extern struct dentry_stat_t dentry_stat; /* * Try to keep struct dentry aligned on 64 byte cachelines (this will * give reasonable cacheline footprint with larger lines without the * large memory footprint increase). */ #ifdef CONFIG_64BIT # define DNAME_INLINE_LEN 32 /* 192 bytes */ #else # ifdef CONFIG_SMP # define DNAME_INLINE_LEN 36 /* 128 bytes */ # else # define DNAME_INLINE_LEN 40 /* 128 bytes */ # endif #endif #define d_lock d_lockref.lock struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_spinlock_t d_seq; /* per dentry seqlock */ struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ /* Ref lookup also touches following */ struct lockref d_lockref; /* per-dentry lock and refcount */ const struct dentry_operations *d_op; struct super_block *d_sb; /* The root of the dentry tree */ unsigned long d_time; /* used by d_revalidate */ void *d_fsdata; /* fs-specific data */ union { struct list_head d_lru; /* LRU list */ wait_queue_head_t *d_wait; /* in-lookup ones only */ }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ /* * d_alias and d_rcu can share memory */ union { struct hlist_node d_alias; /* inode alias list */ struct hlist_bl_node d_in_lookup_hash; /* only for in-lookup ones */ struct rcu_head d_rcu; } d_u; } __randomize_layout; /* * dentry->d_lock spinlock nesting subclasses: * * 0: normal * 1: nested */ enum dentry_d_lock_class { DENTRY_D_LOCK_NORMAL, /* implicitly used by plain spin_lock() APIs. */ DENTRY_D_LOCK_NESTED }; struct dentry_operations { int (*d_revalidate)(struct dentry *, unsigned int); int (*d_weak_revalidate)(struct dentry *, unsigned int); int (*d_hash)(const struct dentry *, struct qstr *); int (*d_compare)(const struct dentry *, unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); int (*d_init)(struct dentry *); void (*d_release)(struct dentry *); void (*d_prune)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); struct vfsmount *(*d_automount)(struct path *); int (*d_manage)(const struct path *, bool); struct dentry *(*d_real)(struct dentry *, const struct inode *); } ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/locking.rst. Keep it updated! * * FUrther descriptions are found in Documentation/filesystems/vfs.rst. * Keep it updated too! */ /* d_flags entries */ #define DCACHE_OP_HASH 0x00000001 #define DCACHE_OP_COMPARE 0x00000002 #define DCACHE_OP_REVALIDATE 0x00000004 #define DCACHE_OP_DELETE 0x00000008 #define DCACHE_OP_PRUNE 0x00000010 #define DCACHE_DISCONNECTED 0x00000020 /* This dentry is possibly not currently connected to the dcache tree, in * which case its parent will either be itself, or will have this flag as * well. nfsd will not use a dentry with this bit set, but will first * endeavour to clear the bit either by discovering that it is connected, * or by performing lookup operations. Any filesystem which supports * nfsd_operations MUST have a lookup function which, if it finds a * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that * dentry into place and return that dentry rather than the passed one, * typically using d_splice_alias. */ #define DCACHE_REFERENCED 0x00000040 /* Recently used, don't discard. */ #define DCACHE_DONTCACHE 0x00000080 /* Purge from memory on final dput() */ #define DCACHE_CANT_MOUNT 0x00000100 #define DCACHE_GENOCIDE 0x00000200 #define DCACHE_SHRINK_LIST 0x00000400 #define DCACHE_OP_WEAK_REVALIDATE 0x00000800 #define DCACHE_NFSFS_RENAMED 0x00001000 /* this dentry has been "silly renamed" and has to be deleted on the last * dput() */ #define DCACHE_COOKIE 0x00002000 /* For use by dcookie subsystem */ #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x00004000 /* Parent inode is watched by some fsnotify listener */ #define DCACHE_DENTRY_KILLED 0x00008000 #define DCACHE_MOUNTED 0x00010000 /* is a mountpoint */ #define DCACHE_NEED_AUTOMOUNT 0x00020000 /* handle automount on this dir */ #define DCACHE_MANAGE_TRANSIT 0x00040000 /* manage transit from this dirent */ #define DCACHE_MANAGED_DENTRY \ (DCACHE_MOUNTED|DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT) #define DCACHE_LRU_LIST 0x00080000 #define DCACHE_ENTRY_TYPE 0x00700000 #define DCACHE_MISS_TYPE 0x00000000 /* Negative dentry (maybe fallthru to nowhere) */ #define DCACHE_WHITEOUT_TYPE 0x00100000 /* Whiteout dentry (stop pathwalk) */ #define DCACHE_DIRECTORY_TYPE 0x00200000 /* Normal directory */ #define DCACHE_AUTODIR_TYPE 0x00300000 /* Lookupless directory (presumed automount) */ #define DCACHE_REGULAR_TYPE 0x00400000 /* Regular file type (or fallthru to such) */ #define DCACHE_SPECIAL_TYPE 0x00500000 /* Other file type (or fallthru to such) */ #define DCACHE_SYMLINK_TYPE 0x00600000 /* Symlink (or fallthru to such) */ #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_NOKEY_NAME 0x02000000 /* Encrypted name encoded without key */ #define DCACHE_OP_REAL 0x04000000 #define DCACHE_PAR_LOOKUP 0x10000000 /* being looked up (with parent locked shared) */ #define DCACHE_DENTRY_CURSOR 0x20000000 #define DCACHE_NORCU 0x40000000 /* No RCU delay for freeing */ extern seqlock_t rename_lock; /* * These are the low-level FS interfaces to the dcache.. */ extern void d_instantiate(struct dentry *, struct inode *); extern void d_instantiate_new(struct dentry *, struct inode *); extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); extern struct dentry * d_instantiate_anon(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, wait_queue_head_t *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); extern struct dentry *d_find_any_alias(struct inode *inode); extern struct dentry * d_obtain_alias(struct inode *); extern struct dentry * d_obtain_root(struct inode *); extern void shrink_dcache_sb(struct super_block *); extern void shrink_dcache_parent(struct dentry *); extern void shrink_dcache_for_umount(struct super_block *); extern void d_invalidate(struct dentry *); /* only used at mount-time */ extern struct dentry * d_make_root(struct inode *); /* <clickety>-<click> the ramfs-type tree */ extern void d_genocide(struct dentry *); extern void d_tmpfile(struct dentry *, struct inode *); extern struct dentry *d_find_alias(struct inode *); extern void d_prune_aliases(struct inode *); /* test whether we have any submounts in a subdir tree */ extern int path_has_submounts(const struct path *); /* * This adds the entry to the hash queues. */ extern void d_rehash(struct dentry *); extern void d_add(struct dentry *, struct inode *); /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern void d_exchange(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ extern struct dentry *d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); static inline unsigned d_count(const struct dentry *dentry) { return dentry->d_lockref.count; } /* * helper function for dentry_operations.d_dname() members */ extern __printf(4, 5) char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *, const struct path *, char *, int); extern char *d_absolute_path(const struct path *, char *, int); extern char *d_path(const struct path *, char *, int); extern char *dentry_path_raw(struct dentry *, char *, int); extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ /** * dget, dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be * destroyed when it has references. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { if (dentry) dentry->d_lockref.count++; return dentry; } static inline struct dentry *dget(struct dentry *dentry) { if (dentry) lockref_get(&dentry->d_lockref); return dentry; } extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed * @dentry: entry to check * * Returns true if the dentry passed is not currently hashed. */ static inline int d_unhashed(const struct dentry *dentry) { return hlist_bl_unhashed(&dentry->d_hash); } static inline int d_unlinked(const struct dentry *dentry) { return d_unhashed(dentry) && !IS_ROOT(dentry); } static inline int cant_mount(const struct dentry *dentry) { return (dentry->d_flags & DCACHE_CANT_MOUNT); } static inline void dont_mount(struct dentry *dentry) { spin_lock(&dentry->d_lock); dentry->d_flags |= DCACHE_CANT_MOUNT; spin_unlock(&dentry->d_lock); } extern void __d_lookup_done(struct dentry *); static inline int d_in_lookup(const struct dentry *dentry) { return dentry->d_flags & DCACHE_PAR_LOOKUP; } static inline void d_lookup_done(struct dentry *dentry) { if (unlikely(d_in_lookup(dentry))) { spin_lock(&dentry->d_lock); __d_lookup_done(dentry); spin_unlock(&dentry->d_lock); } } extern void dput(struct dentry *); static inline bool d_managed(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MANAGED_DENTRY; } static inline bool d_mountpoint(const struct dentry *dentry) { return dentry->d_flags & DCACHE_MOUNTED; } /* * Directory cache entry type accessor functions. */ static inline unsigned __d_entry_type(const struct dentry *dentry) { return dentry->d_flags & DCACHE_ENTRY_TYPE; } static inline bool d_is_miss(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_MISS_TYPE; } static inline bool d_is_whiteout(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_WHITEOUT_TYPE; } static inline bool d_can_lookup(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_DIRECTORY_TYPE; } static inline bool d_is_autodir(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_AUTODIR_TYPE; } static inline bool d_is_dir(const struct dentry *dentry) { return d_can_lookup(dentry) || d_is_autodir(dentry); } static inline bool d_is_symlink(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SYMLINK_TYPE; } static inline bool d_is_reg(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_REGULAR_TYPE; } static inline bool d_is_special(const struct dentry *dentry) { return __d_entry_type(dentry) == DCACHE_SPECIAL_TYPE; } static inline bool d_is_file(const struct dentry *dentry) { return d_is_reg(dentry) || d_is_special(dentry); } static inline bool d_is_negative(const struct dentry *dentry) { // TODO: check d_is_whiteout(dentry) also. return d_is_miss(dentry); } static inline bool d_flags_negative(unsigned flags) { return (flags & DCACHE_ENTRY_TYPE) == DCACHE_MISS_TYPE; } static inline bool d_is_positive(const struct dentry *dentry) { return !d_is_negative(dentry); } /** * d_really_is_negative - Determine if a dentry is really negative (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents either an absent name or a name that * doesn't map to an inode (ie. ->d_inode is NULL). The dentry could represent * a true miss, a whiteout that isn't represented by a 0,0 chardev or a * fallthrough marker in an opaque directory. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. (3) The dentry may have something attached to ->d_lower and the * type field of the flags may be set to something other than miss or whiteout. */ static inline bool d_really_is_negative(const struct dentry *dentry) { return dentry->d_inode == NULL; } /** * d_really_is_positive - Determine if a dentry is really positive (ignoring fallthroughs) * @dentry: The dentry in question * * Returns true if the dentry represents a name that maps to an inode * (ie. ->d_inode is not NULL). The dentry might still represent a whiteout if * that is represented on medium as a 0,0 chardev. * * Note! (1) This should be used *only* by a filesystem to examine its own * dentries. It should not be used to look at some other filesystem's * dentries. (2) It should also be used in combination with d_inode() to get * the inode. */ static inline bool d_really_is_positive(const struct dentry *dentry) { return dentry->d_inode != NULL; } static inline int simple_positive(const struct dentry *dentry) { return d_really_is_positive(dentry) && !d_unhashed(dentry); } extern void d_set_fallthru(struct dentry *dentry); static inline bool d_is_fallthru(const struct dentry *dentry) { return dentry->d_flags & DCACHE_FALLTHRU; } extern int sysctl_vfs_cache_pressure; static inline unsigned long vfs_pressure_ratio(unsigned long val) { return mult_frac(val, sysctl_vfs_cache_pressure, 100); } /** * d_inode - Get the actual inode of this dentry * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode(const struct dentry *dentry) { return dentry->d_inode; } /** * d_inode_rcu - Get the actual inode of this dentry with READ_ONCE() * @dentry: The dentry to query * * This is the helper normal filesystems should use to get at their own inodes * in their own dentries and ignore the layering superimposed upon them. */ static inline struct inode *d_inode_rcu(const struct dentry *dentry) { return READ_ONCE(dentry->d_inode); } /** * d_backing_inode - Get upper or lower inode we should be using * @upper: The upper layer * * This is the helper that should be used to get at the inode that will be used * if this dentry were to be opened as a file. The inode may be on the upper * dentry or it may be on a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own inodes. */ static inline struct inode *d_backing_inode(const struct dentry *upper) { struct inode *inode = upper->d_inode; return inode; } /** * d_backing_dentry - Get upper or lower dentry we should be using * @upper: The upper layer * * This is the helper that should be used to get the dentry of the inode that * will be used if this dentry were opened as a file. It may be the upper * dentry or it may be a lower dentry pinned by the upper. * * Normal filesystems should not use this to access their own dentries. */ static inline struct dentry *d_backing_dentry(struct dentry *upper) { return upper; } /** * d_real - Return the real dentry * @dentry: the dentry to query * @inode: inode to select the dentry from multiple layers (can be NULL) * * If dentry is on a union/overlay, then return the underlying, real dentry. * Otherwise return the dentry itself. * * See also: Documentation/filesystems/vfs.rst */ static inline struct dentry *d_real(struct dentry *dentry, const struct inode *inode) { if (unlikely(dentry->d_flags & DCACHE_OP_REAL)) return dentry->d_op->d_real(dentry, inode); else return dentry; } /** * d_real_inode - Return the real inode * @dentry: The dentry to query * * If dentry is on a union/overlay, then return the underlying, real inode. * Otherwise return d_inode(). */ static inline struct inode *d_real_inode(const struct dentry *dentry) { /* This usage of d_real() results in const dentry */ return d_backing_inode(d_real((struct dentry *) dentry, NULL)); } struct name_snapshot { struct qstr name; unsigned char inline_name[DNAME_INLINE_LEN]; }; void take_dentry_name_snapshot(struct name_snapshot *, struct dentry *); void release_dentry_name_snapshot(struct name_snapshot *); #endif /* __LINUX_DCACHE_H */
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_H #define _ASM_X86_UACCESS_H /* * User space memory access functions */ #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> /* * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. */ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) { /* * If we have used "sizeof()" for the size, * we know it won't overflow the limit (but * it might overflow the 'addr', so it's * important to subtract the size from the * limit, not add it to the address). */ if (__builtin_constant_p(size)) return unlikely(addr > limit - size); /* Arbitrary sizes? Be careful about overflow */ addr += size; if (unlikely(addr < size)) return true; return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ ({ \ __chk_user_ptr(addr); \ __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline bool pagefault_disabled(void); # define WARN_ON_IN_IRQ() \ WARN_ON_ONCE(!in_task() && !pagefault_disabled()) #else # define WARN_ON_IN_IRQ() #endif /** * access_ok - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check * @size: Size of block to check * * Context: User context only. This function may sleep if pagefaults are * enabled. * * Checks if a pointer to a block of memory in user space is valid. * * Note that, depending on architecture, this function probably just * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. * * Return: true (nonzero) if the memory block may be valid, false (zero) * if it is definitely invalid. */ #define access_ok(addr, size) \ ({ \ WARN_ON_IN_IRQ(); \ likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \ }) extern int __get_user_1(void); extern int __get_user_2(void); extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_nocheck_1(void); extern int __get_user_nocheck_2(void); extern int __get_user_nocheck_4(void); extern int __get_user_nocheck_8(void); extern int __get_user_bad(void); #define __uaccess_begin() stac() #define __uaccess_end() clac() #define __uaccess_begin_nospec() \ ({ \ stac(); \ barrier_nospec(); \ }) /* * This is the smallest unsigned integer type that can fit a value * (up to 'long long') */ #define __inttype(x) __typeof__( \ __typefits(x,char, \ __typefits(x,short, \ __typefits(x,int, \ __typefits(x,long,0ULL))))) #define __typefits(x,type,not) \ __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not) /* * This is used for both get_user() and __get_user() to expand to * the proper special function call that has odd calling conventions * due to returning both a value and an error, and that depends on * the size of the pointer passed in. * * Careful: we have to cast the result to the type of the pointer * for sign reasons. * * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. * * Clang/LLVM cares about the size of the register, but still wants * the base register for something that ends up being a pair. */ #define do_get_user_call(fn,x,ptr) \ ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ asm volatile("call __" #fn "_%P4" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) /** * get_user - Get a simple variable from user space. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); }) /** * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr) #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ asm_volatile_goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ _ASM_EXTABLE_UA(2b, %l2) \ : : "A" (x), "r" (addr) \ : : label) #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "er", label) #endif extern void __put_user_bad(void); /* * Strange magic calling convention: pointer in %ecx, * value in %eax(:%edx), return value in %ecx. clobbers %rbx */ extern void __put_user_1(void); extern void __put_user_2(void); extern void __put_user_4(void); extern void __put_user_8(void); extern void __put_user_nocheck_1(void); extern void __put_user_nocheck_2(void); extern void __put_user_nocheck_4(void); extern void __put_user_nocheck_8(void); /* * ptr must be evaluated and assigned to the temporary __ptr_pu before * the assignment of x to __val_pu, to avoid any function calls * involved in the ptr expression (possibly implicitly generated due * to KASAN) from clobbering %ax. */ #define do_put_user_call(fn,x,ptr) \ ({ \ int __ret_pu; \ void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ __chk_user_ptr(ptr); \ __ptr_pu = (ptr); \ __val_pu = (x); \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ : "0" (__ptr_pu), \ "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ __builtin_expect(__ret_pu, 0); \ }) /** * put_user - Write a simple value into user space. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Return: zero on success, or -EFAULT on error. */ #define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); }) /** * __put_user - Write a simple value into user space, with less checking. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. */ #define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr) #define __put_user_size(x, ptr, size, label) \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ __put_user_goto(x, ptr, "b", "iq", label); \ break; \ case 2: \ __put_user_goto(x, ptr, "w", "ir", label); \ break; \ case 4: \ __put_user_goto(x, ptr, "l", "ir", label); \ break; \ case 8: \ __put_user_goto_u64(x, ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, label) do { \ unsigned int __gu_low, __gu_high; \ const unsigned int __user *__gu_ptr; \ __gu_ptr = (const void __user *)(ptr); \ __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \ __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \ (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \ } while (0) #else #define __get_user_asm_u64(x, ptr, label) \ __get_user_asm(x, ptr, "q", "=r", label) #endif #define __get_user_size(x, ptr, size, label) \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ unsigned char x_u8__; \ case 1: \ __get_user_asm(x_u8__, ptr, "b", "=q", label); \ (x) = x_u8__; \ break; \ case 2: \ __get_user_asm(x, ptr, "w", "=r", label); \ break; \ case 4: \ __get_user_asm(x, ptr, "l", "=r", label); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, label); \ break; \ default: \ (x) = __get_user_bad(); \ } \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ : [umem] "m" (__m(addr)) \ : : label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ asm volatile("\n" \ "1: movl %[lowbits],%%eax\n" \ "2: movl %[highbits],%%edx\n" \ "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: mov %[efault],%[errout]\n" \ " xorl %%eax,%%eax\n" \ " xorl %%edx,%%edx\n" \ " jmp 3b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 4b) \ _ASM_EXTABLE_UA(2b, 4b) \ : [errout] "=r" (retval), \ [output] "=&A"(x) \ : [lowbits] "m" (__m(__ptr)), \ [highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \ [efault] "i" (-EFAULT), "0" (retval)); \ }) #else #define __get_user_asm_u64(x, ptr, retval) \ __get_user_asm(x, ptr, retval, "q", "=r") #endif #define __get_user_size(x, ptr, size, retval) \ do { \ unsigned char x_u8__; \ \ retval = 0; \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \ (x) = x_u8__; \ break; \ case 2: \ __get_user_asm(x, ptr, retval, "w", "=r"); \ break; \ case 4: \ __get_user_asm(x, ptr, retval, "l", "=r"); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, retval); \ break; \ default: \ (x) = __get_user_bad(); \ } \ } while (0) #define __get_user_asm(x, addr, err, itype, ltype) \ asm volatile("\n" \ "1: mov"itype" %[umem],%[output]\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %[efault],%[errout]\n" \ " xorl %k[output],%k[output]\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 3b) \ : [errout] "=r" (err), \ [output] ltype(x) \ : [umem] "m" (__m(addr)), \ [efault] "i" (-EFAULT), "0" (err)) #endif // CONFIG_CC_ASM_GOTO_OUTPUT /* FIXME: this hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(struct __large_struct __user *)(x)) /* * Tell gcc we read from memory instead of writing: this is because * we do not write to any memory gcc knows about, so there are no * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ : : label) extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); #ifdef CONFIG_ARCH_HAS_COPY_MC unsigned long __must_check copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check copy_mc_to_user(void *to, const void *from, unsigned len); #endif /* * movsl can be slow when source and dest are not both 8-byte aligned */ #ifdef CONFIG_X86_INTEL_USERCOPY extern struct movsl_mask { int mask; } ____cacheline_aligned_in_smp movsl_mask; #endif #define ARCH_HAS_NOCACHE_UACCESS 1 #ifdef CONFIG_X86_32 # include <asm/uaccess_32.h> #else # include <asm/uaccess_64.h> #endif /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; __uaccess_begin_nospec(); return 1; } #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() #define user_access_save() smap_save() #define user_access_restore(x) smap_restore(x) #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ if (unlikely(__gu_err)) goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. */ #define unsafe_copy_loop(dst, src, len, type, label) \ while (len >= sizeof(type)) { \ unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } #define unsafe_copy_to_user(_dst,_src,_len,label) \ do { \ char __user *__ucu_dst = (_dst); \ const char *__ucu_src = (_src); \ size_t __ucu_len = (_len); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ } while (0) #define HAVE_GET_KERNEL_NOFAULT #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), err_label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ int __kr_err; \ \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), __kr_err); \ if (unlikely(__kr_err)) \ goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __put_kernel_nofault(dst, src, type, err_label) \ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ sizeof(type), err_label) #endif /* _ASM_X86_UACCESS_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPU's in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/atomic.h> #include <linux/bug.h> /* Don't assign or return these: may not be this big! */ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; /** * cpumask_bits - get the bits in a cpumask * @maskp: the struct cpumask * * * You should only assume nr_cpu_ids bits of this mask are valid. This is * a macro so it's const-correct. */ #define cpumask_bits(maskp) ((maskp)->bits) /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if NR_CPUS == 1 #define nr_cpu_ids 1U #else extern unsigned int nr_cpu_ids; #endif #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ #define nr_cpumask_bits nr_cpu_ids #else #define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU id's * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_possible_mask is forced to have * all NR_CPUS bits set, otherwise it is just the set of CPUs that * ACPI reports present at boot. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) extern atomic_t __num_online_cpus; #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ static inline unsigned int num_online_cpus(void) { return atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) #define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask) #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) #define cpu_active(cpu) ((cpu) == 0) #endif extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } #if NR_CPUS == 1 /* Uniprocessor. Assume all masks are "1". */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return 0; } static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; } /* Valid inputs for n are -1 and 0. */ static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_and(int n, const struct cpumask *srcp, const struct cpumask *andp) { return n+1; } static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { /* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */ return (wrap && n == 0); } /* cpu must be a valid cpu, ie 0, so there's no other choice. */ static inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { return 1; } static inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_next_and(-1, src1p, src2p); } #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no cpus set. */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Returns >= nr_cpumask_bits if no CPUs set. */ static inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); } unsigned int cpumask_next(int n, const struct cpumask *srcp); /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (ie. return will be > @n) * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no further cpus unset. */ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next((cpu), (mask)), \ (cpu) < nr_cpu_ids;) /** * for_each_cpu_not - iterate over every cpu in a complemented mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_not(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask poiter * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ (cpu) < nr_cpumask_bits; \ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in @cpumask, else returns 0 */ static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_set_bit wrapper for cpumasks. */ static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_clear_bit wrapper for cpumasks. */ static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_setall(struct cpumask *dstp) { bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_complement - *dstp = ~*srcp * @dstp: the cpumask result * @srcp: the input to invert */ static inline void cpumask_complement(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input */ static inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input */ static inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), nr_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input */ static inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_subset - (*src1p & ~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Returns 1 if *@src1p is a subset of *@src2p, else returns 0 */ static inline int cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. */ static inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. */ static inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. */ static inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_any - pick a "random" cpu from *srcp * @srcp: the input cpumask * * Returns >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input * @src2p: the second input * * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ #define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) /** * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Returns >= nr_cpu_ids if no cpus set. */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - size to allocate for a 'struct cpumask' in bytes */ static inline unsigned int cpumask_size(void) { return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); } /* * cpumask_var_t: struct cpumask for stack usage. * * Oh, the wicked games we play! In order to make kernel coding a * little more difficult, we typedef cpumask_var_t to an array or a * pointer: doing &mask on an array is a noop, so it still works. * * ie. * cpumask_var_t tmpmask; * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) * return -ENOMEM; * * ... use 'tmpmask' like a normal struct cpumask * ... * * free_cpumask_var(tmpmask); * * * However, one notable exception is there. alloc_cpumask_var() allocates * only nr_cpumask_bits bits (in the other hand, real cpumask_t always has * NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t. * * cpumask_var_t tmpmask; * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) * return -ENOMEM; * * var = *tmpmask; * * This code makes NR_CPUS length memcopy and brings to a memory corruption. * cpumask_copy() provide safe copy functionality. * * Note that there is another evil here: If you define a cpumask_var_t * as a percpu variable then the way to obtain the address of the cpumask * structure differently influences what this_cpu_* operation needs to be * used. Please use this_cpu_cpumask_var_t in those cases. The direct use * of this_cpu_ptr() or this_cpu_read() will lead to failures when the * other type of cpumask_var_t implementation is configured. * * Please also note that __cpumask_var_read_mostly can be used to declare * a cpumask_var_t variable itself (not its content) as read mostly. */ #ifdef CONFIG_CPUMASK_OFFSTACK typedef struct cpumask *cpumask_var_t; #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static inline void free_cpumask_var(cpumask_var_t mask) { } static inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); static inline void reset_cpu_possible_mask(void) { bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); } static inline void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) cpumask_set_cpu(cpu, &__cpu_possible_mask); else cpumask_clear_cpu(cpu, &__cpu_possible_mask); } static inline void set_cpu_present(unsigned int cpu, bool present) { if (present) cpumask_set_cpu(cpu, &__cpu_present_mask); else cpumask_clear_cpu(cpu, &__cpu_present_mask); } void set_cpu_online(unsigned int cpu, bool online); static inline void set_cpu_active(unsigned int cpu, bool active) { if (active) cpumask_set_cpu(cpu, &__cpu_active_mask); else cpumask_clear_cpu(cpu, &__cpu_active_mask); } /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Returns the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } #endif /* __LINUX_CPUMASK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 /* SPDX-License-Identifier: GPL-2.0 */ /* * This file provides wrappers with sanitizer instrumentation for non-atomic * bit operations. * * To use this functionality, an arch's bitops.h file needs to define each of * the below bit operations with an arch_ prefix (e.g. arch_set_bit(), * arch___set_bit(), etc.). */ #ifndef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #define _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H #include <linux/instrumented.h> /** * __set_bit - Set a bit in memory * @nr: the bit to set * @addr: the address to start counting from * * Unlike set_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static inline void __set_bit(long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___set_bit(nr, addr); } /** * __clear_bit - Clears a bit in memory * @nr: the bit to clear * @addr: the address to start counting from * * Unlike clear_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static inline void __clear_bit(long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___clear_bit(nr, addr); } /** * __change_bit - Toggle a bit in memory * @nr: the bit to change * @addr: the address to start counting from * * Unlike change_bit(), this function is non-atomic. If it is called on the same * region of memory concurrently, the effect may be that only one operation * succeeds. */ static inline void __change_bit(long nr, volatile unsigned long *addr) { instrument_write(addr + BIT_WORD(nr), sizeof(long)); arch___change_bit(nr, addr); } static inline void __instrument_read_write_bitop(long nr, volatile unsigned long *addr) { if (IS_ENABLED(CONFIG_KCSAN_ASSUME_PLAIN_WRITES_ATOMIC)) { /* * We treat non-atomic read-write bitops a little more special. * Given the operations here only modify a single bit, assuming * non-atomicity of the writer is sufficient may be reasonable * for certain usage (and follows the permissible nature of the * assume-plain-writes-atomic rule): * 1. report read-modify-write races -> check read; * 2. do not report races with marked readers, but do report * races with unmarked readers -> check "atomic" write. */ kcsan_check_read(addr + BIT_WORD(nr), sizeof(long)); /* * Use generic write instrumentation, in case other sanitizers * or tools are enabled alongside KCSAN. */ instrument_write(addr + BIT_WORD(nr), sizeof(long)); } else { instrument_read_write(addr + BIT_WORD(nr), sizeof(long)); } } /** * __test_and_set_bit - Set a bit and return its old value * @nr: Bit to set * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_set_bit(nr, addr); } /** * __test_and_clear_bit - Clear a bit and return its old value * @nr: Bit to clear * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_clear_bit(nr, addr); } /** * __test_and_change_bit - Change a bit and return its old value * @nr: Bit to change * @addr: Address to count from * * This operation is non-atomic. If two instances of this operation race, one * can appear to succeed but actually fail. */ static inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) { __instrument_read_write_bitop(nr, addr); return arch___test_and_change_bit(nr, addr); } /** * test_bit - Determine whether a bit is set * @nr: bit number to test * @addr: Address to start counting from */ static inline bool test_bit(long nr, const volatile unsigned long *addr) { instrument_atomic_read(addr + BIT_WORD(nr), sizeof(long)); return arch_test_bit(nr, addr); } #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_GFP_H #define __LINUX_GFP_H #include <linux/mmdebug.h> #include <linux/mmzone.h> #include <linux/stddef.h> #include <linux/linkage.h> #include <linux/topology.h> struct vm_area_struct; /* * In case of changes, please don't forget to update * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c */ /* Plain integer GFP bitmasks. Do not use this directly. */ #define ___GFP_DMA 0x01u #define ___GFP_HIGHMEM 0x02u #define ___GFP_DMA32 0x04u #define ___GFP_MOVABLE 0x08u #define ___GFP_RECLAIMABLE 0x10u #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u #define ___GFP_ZERO 0x100u #define ___GFP_ATOMIC 0x200u #define ___GFP_DIRECT_RECLAIM 0x400u #define ___GFP_KSWAPD_RECLAIM 0x800u #define ___GFP_WRITE 0x1000u #define ___GFP_NOWARN 0x2000u #define ___GFP_RETRY_MAYFAIL 0x4000u #define ___GFP_NOFAIL 0x8000u #define ___GFP_NORETRY 0x10000u #define ___GFP_MEMALLOC 0x20000u #define ___GFP_COMP 0x40000u #define ___GFP_NOMEMALLOC 0x80000u #define ___GFP_HARDWALL 0x100000u #define ___GFP_THISNODE 0x200000u #define ___GFP_ACCOUNT 0x400000u #ifdef CONFIG_LOCKDEP #define ___GFP_NOLOCKDEP 0x800000u #else #define ___GFP_NOLOCKDEP 0 #endif /* If the above are modified, __GFP_BITS_SHIFT may need updating */ /* * Physical address zone modifiers (see linux/mmzone.h - low four bits) * * Do not put any conditional on these. If necessary modify the definitions * without the underscores and use them consistently. The definitions here may * be used in bit comparisons. */ #define __GFP_DMA ((__force gfp_t)___GFP_DMA) #define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) #define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) #define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ #define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) /** * DOC: Page mobility and placement hints * * Page mobility and placement hints * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * These flags provide hints about how mobile the page is. Pages with similar * mobility are placed within the same pageblocks to minimise problems due * to external fragmentation. * * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be * moved by page migration during memory compaction or can be reclaimed. * * %__GFP_RECLAIMABLE is used for slab allocations that specify * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers. * * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible, * these pages will be spread between local zones to avoid all the dirty * pages being in one zone (fair zone allocation policy). * * %__GFP_HARDWALL enforces the cpuset memory allocation policy. * * %__GFP_THISNODE forces the allocation to be satisfied from the requested * node with no fallbacks or placement policy enforcements. * * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) #define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) /** * DOC: Watermark modifiers * * Watermark modifiers -- controls access to emergency reserves * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * %__GFP_HIGH indicates that the caller is high-priority and that granting * the request is necessary before the system can make forward progress. * For example, creating an IO context to clean pages. * * %__GFP_ATOMIC indicates that the caller cannot reclaim or sleep and is * high priority. Users are typically interrupt handlers. This may be * used in conjunction with %__GFP_HIGH * * %__GFP_MEMALLOC allows access to all memory. This should only be used when * the caller guarantees the allocation will allow more memory to be freed * very shortly e.g. process exiting or swapping. Users either should * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). * Users of this flag have to be extremely careful to not deplete the reserve * completely and implement a throttling mechanism which controls the * consumption of the reserve based on the amount of freed memory. * Usage of a pre-allocated pool (e.g. mempool) should be always considered * before using this flag. * * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. * This takes precedence over the %__GFP_MEMALLOC flag if both are set. */ #define __GFP_ATOMIC ((__force gfp_t)___GFP_ATOMIC) #define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) #define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) #define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /** * DOC: Reclaim modifiers * * Reclaim modifiers * ~~~~~~~~~~~~~~~~~ * Please note that all the following flags are only applicable to sleepable * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them). * * %__GFP_IO can start physical IO. * * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the * allocator recursing into the filesystem which might already be holding * locks. * * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim. * This flag can be cleared to avoid unnecessary delays when a fallback * option is available. * * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when * the low watermark is reached and have it reclaim pages until the high * watermark is reached. A caller may wish to clear this flag when fallback * options are available and the reclaim is likely to disrupt the system. The * canonical example is THP allocation where a fallback is cheap but * reclaim/compaction may cause indirect stalls. * * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. * * The default allocator behavior depends on the request size. We have a concept * of so called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). * !costly allocations are too essential to fail so they are implicitly * non-failing by default (with some exceptions like OOM victims might fail so * the caller still has to check for failures) while costly requests try to be * not disruptive and back off even without invoking the OOM killer. * The following three modifiers might be used to override some of these * implicit rules * * %__GFP_NORETRY: The VM implementation will try only very lightweight * memory direct reclaim to get some memory under memory pressure (thus * it can sleep). It will avoid disruptive actions like OOM killer. The * caller must handle the failure which is quite likely to happen under * heavy memory pressure. The flag is suitable when failure can easily be * handled at small cost, such as reduced throughput * * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim * procedures that have previously failed if there is some indication * that progress has been made else where. It can wait for other * tasks to attempt high level approaches to freeing memory such as * compaction (which removes fragmentation) and page-out. * There is still a definite limit to the number of retries, but it is * a larger limit than with %__GFP_NORETRY. * Allocations with this flag may fail, but only when there is * genuinely little unused memory. While these allocations do not * directly trigger the OOM killer, their failure indicates that * the system is likely to need to use the OOM killer soon. The * caller must handle failure, but can reasonably do so by failing * a higher-level request, or completing it only in a much less * efficient manner. * If the allocation does fail, and the caller is in a position to * free some non-essential memory, doing so could benefit the system * as a whole. * * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller * cannot handle allocation failures. The allocation could block * indefinitely but will never return with failure. Testing for * failure is pointless. * New users should be evaluated carefully (and the flag should be * used only when there is no reasonable failure policy) but it is * definitely preferable to use the flag rather than opencode endless * loop around allocator. * Using this flag for costly allocations is _highly_ discouraged. */ #define __GFP_IO ((__force gfp_t)___GFP_IO) #define __GFP_FS ((__force gfp_t)___GFP_FS) #define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ #define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ #define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) #define __GFP_RETRY_MAYFAIL ((__force gfp_t)___GFP_RETRY_MAYFAIL) #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /** * DOC: Action modifiers * * Action modifiers * ~~~~~~~~~~~~~~~~ * * %__GFP_NOWARN suppresses allocation failure reports. * * %__GFP_COMP address compound page metadata. * * %__GFP_ZERO returns a zeroed page on success. */ #define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) #define __GFP_COMP ((__force gfp_t)___GFP_COMP) #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Disable lockdep for GFP context tracking */ #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ #define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /** * DOC: Useful GFP flag combinations * * Useful GFP flag combinations * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * * Useful GFP flag combinations that are commonly used. It is recommended * that subsystems start with one of these combinations and then set/clear * %__GFP_FOO flags as necessary. * * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower * watermark is applied to allow access to "atomic reserves". * The current implementation doesn't support NMI and few other strict * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. * * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. * * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is * accounted to kmemcg. * * %GFP_NOWAIT is for kernel allocations that should not stall for direct * reclaim, start physical IO or use any filesystem callback. * * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages * that do not require the starting of any physical IO. * Please try to avoid using this flag directly and instead use * memalloc_noio_{save,restore} to mark the whole scope which cannot * perform any IO with a short explanation why. All allocation requests * will inherit GFP_NOIO implicitly. * * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. * Please try to avoid using this flag directly and instead use * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't * recurse into the FS layer with a short explanation why. All allocation * requests will inherit GFP_NOFS implicitly. * * %GFP_USER is for userspace allocations that also need to be directly * accessibly by the kernel or hardware. It is typically used by hardware * for buffers that are mapped to userspace (e.g. graphics) that hardware * still must DMA to. cpuset limits are enforced for these allocations. * * %GFP_DMA exists for historical reasons and should be avoided where possible. * The flags indicates that the caller requires that the lowest zone be * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but * it would require careful auditing as some users really require it and * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the * lowest zone as a type of emergency reserve. * * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit * address. * * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, * do not need to be directly accessible by the kernel but that cannot * move once in use. An example may be a hardware allocation that maps * data directly into userspace but has no addressing limitations. * * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not * need direct access to but can use kmap() when access is required. They * are expected to be movable via page reclaim or page migration. Typically, * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE. * * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They * are compound allocations that will generally fail quickly if memory is not * available and will not wake kswapd/kcompactd on failure. The _LIGHT * version does not attempt reclaim/compaction at all and is by default used * in page fault path, while the non-light is used by khugepaged. */ #define GFP_ATOMIC (__GFP_HIGH|__GFP_ATOMIC|__GFP_KSWAPD_RECLAIM) #define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) #define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) #define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM) #define GFP_NOIO (__GFP_RECLAIM) #define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) #define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) #define GFP_DMA __GFP_DMA #define GFP_DMA32 __GFP_DMA32 #define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) #define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE) #define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) #define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) /* Convert GFP flags to their corresponding migrate type */ #define GFP_MOVABLE_MASK (__GFP_RECLAIMABLE|__GFP_MOVABLE) #define GFP_MOVABLE_SHIFT 3 static inline int gfp_migratetype(const gfp_t gfp_flags) { VM_WARN_ON((gfp_flags & GFP_MOVABLE_MASK) == GFP_MOVABLE_MASK); BUILD_BUG_ON((1UL << GFP_MOVABLE_SHIFT) != ___GFP_MOVABLE); BUILD_BUG_ON((___GFP_MOVABLE >> GFP_MOVABLE_SHIFT) != MIGRATE_MOVABLE); if (unlikely(page_group_by_mobility_disabled)) return MIGRATE_UNMOVABLE; /* Group based on mobility */ return (gfp_flags & GFP_MOVABLE_MASK) >> GFP_MOVABLE_SHIFT; } #undef GFP_MOVABLE_MASK #undef GFP_MOVABLE_SHIFT static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) { return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } /** * gfpflags_normal_context - is gfp_flags a normal sleepable context? * @gfp_flags: gfp_flags to test * * Test whether @gfp_flags indicates that the allocation is from the * %current context and allowed to sleep. * * An allocation being allowed to block doesn't mean it owns the %current * context. When direct reclaim path tries to allocate memory, the * allocation context is nested inside whatever %current was doing at the * time of the original allocation. The nested allocation may be allowed * to block but modifying anything %current owns can corrupt the outer * context's expectations. * * %true result from this function indicates that the allocation context * can sleep and use anything that's associated with %current. */ static inline bool gfpflags_normal_context(const gfp_t gfp_flags) { return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) == __GFP_DIRECT_RECLAIM; } #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else #define OPT_ZONE_HIGHMEM ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA #define OPT_ZONE_DMA ZONE_DMA #else #define OPT_ZONE_DMA ZONE_NORMAL #endif #ifdef CONFIG_ZONE_DMA32 #define OPT_ZONE_DMA32 ZONE_DMA32 #else #define OPT_ZONE_DMA32 ZONE_NORMAL #endif /* * GFP_ZONE_TABLE is a word size bitstring that is used for looking up the * zone to use given the lowest 4 bits of gfp_t. Entries are GFP_ZONES_SHIFT * bits long and there are 16 of them to cover all possible combinations of * __GFP_DMA, __GFP_DMA32, __GFP_MOVABLE and __GFP_HIGHMEM. * * The zone fallback order is MOVABLE=>HIGHMEM=>NORMAL=>DMA32=>DMA. * But GFP_MOVABLE is not only a zone specifier but also an allocation * policy. Therefore __GFP_MOVABLE plus another zone selector is valid. * Only 1 bit of the lowest 3 bits (DMA,DMA32,HIGHMEM) can be set to "1". * * bit result * ================= * 0x0 => NORMAL * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA) * 0x8 => NORMAL (MOVABLE+0) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA) * * GFP_ZONES_SHIFT must be <= 2 on 32 bit platforms. */ #if defined(CONFIG_ZONE_DEVICE) && (MAX_NR_ZONES-1) <= 4 /* ZONE_DEVICE is not a valid GFP zone specifier */ #define GFP_ZONES_SHIFT 2 #else #define GFP_ZONES_SHIFT ZONES_SHIFT #endif #if 16 * GFP_ZONES_SHIFT > BITS_PER_LONG #error GFP_ZONES_SHIFT too large to create GFP_ZONE_TABLE integer #endif #define GFP_ZONE_TABLE ( \ (ZONE_NORMAL << 0 * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << ___GFP_DMA * GFP_ZONES_SHIFT) \ | (OPT_ZONE_HIGHMEM << ___GFP_HIGHMEM * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA32 << ___GFP_DMA32 * GFP_ZONES_SHIFT) \ | (ZONE_NORMAL << ___GFP_MOVABLE * GFP_ZONES_SHIFT) \ | (OPT_ZONE_DMA << (___GFP_MOVABLE | ___GFP_DMA) * GFP_ZONES_SHIFT) \ | (ZONE_MOVABLE << (___GFP_MOVABLE | ___GFP_HIGHMEM) * GFP_ZONES_SHIFT)\ | (OPT_ZONE_DMA32 << (___GFP_MOVABLE | ___GFP_DMA32) * GFP_ZONES_SHIFT)\ ) /* * GFP_ZONE_BAD is a bitmap for all combinations of __GFP_DMA, __GFP_DMA32 * __GFP_HIGHMEM and __GFP_MOVABLE that are not permitted. One flag per * entry starting with bit 0. Bit is set if the combination is not * allowed. */ #define GFP_ZONE_BAD ( \ 1 << (___GFP_DMA | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32) \ | 1 << (___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_DMA | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_HIGHMEM | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_HIGHMEM) \ | 1 << (___GFP_MOVABLE | ___GFP_DMA32 | ___GFP_DMA | ___GFP_HIGHMEM) \ ) static inline enum zone_type gfp_zone(gfp_t flags) { enum zone_type z; int bit = (__force int) (flags & GFP_ZONEMASK); z = (GFP_ZONE_TABLE >> (bit * GFP_ZONES_SHIFT)) & ((1 << GFP_ZONES_SHIFT) - 1); VM_BUG_ON((GFP_ZONE_BAD >> bit) & 1); return z; } /* * There is only one page-allocator function, and two main namespaces to * it. The alloc_page*() variants return 'struct page *' and as such * can allocate highmem pages, the *get*page*() variants return * virtual kernel addresses to the allocated page(s). */ static inline int gfp_zonelist(gfp_t flags) { #ifdef CONFIG_NUMA if (unlikely(flags & __GFP_THISNODE)) return ZONELIST_NOFALLBACK; #endif return ZONELIST_FALLBACK; } /* * We get the zone list from the current node and the gfp_mask. * This zone list contains a maximum of MAXNODES*MAX_NR_ZONES zones. * There are two zonelists per node, one for all zones with memory and * one containing just zones from the node the zonelist belongs to. * * For the normal case of non-DISCONTIGMEM systems the NODE_DATA() gets * optimized to &contig_page_data at compile-time. */ static inline struct zonelist *node_zonelist(int nid, gfp_t flags) { return NODE_DATA(nid)->node_zonelists + gfp_zonelist(flags); } #ifndef HAVE_ARCH_FREE_PAGE static inline void arch_free_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_ALLOC_PAGE static inline void arch_alloc_page(struct page *page, int order) { } #endif #ifndef HAVE_ARCH_MAKE_PAGE_ACCESSIBLE static inline int arch_make_page_accessible(struct page *page) { return 0; } #endif struct page * __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, nodemask_t *nodemask); static inline struct page * __alloc_pages(gfp_t gfp_mask, unsigned int order, int preferred_nid) { return __alloc_pages_nodemask(gfp_mask, order, preferred_nid, NULL); } /* * Allocate pages, preferring the node given as nid. The node must be valid and * online. For more general interface, see alloc_pages_node(). */ static inline struct page * __alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); VM_WARN_ON((gfp_mask & __GFP_THISNODE) && !node_online(nid)); return __alloc_pages(gfp_mask, order, nid); } /* * Allocate pages, preferring the node given as nid. When nid == NUMA_NO_NODE, * prefer the current CPU's closest node. Otherwise node must be valid and * online. */ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, unsigned int order) { if (nid == NUMA_NO_NODE) nid = numa_mem_id(); return __alloc_pages_node(nid, gfp_mask, order); } #ifdef CONFIG_NUMA extern struct page *alloc_pages_current(gfp_t gfp_mask, unsigned order); static inline struct page * alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, int node, bool hugepage); #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_node(numa_node_id(), gfp_mask, order); } #define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); void *alloc_pages_exact(size_t size, gfp_t gfp_mask); void free_pages_exact(void *virt, size_t size); void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); #define __get_free_page(gfp_mask) \ __get_free_pages((gfp_mask), 0) #define __get_dma_pages(gfp_mask, order) \ __get_free_pages((gfp_mask) | GFP_DMA, (order)) extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); extern void free_unref_page(struct page *page); extern void free_unref_page_list(struct list_head *list); struct page_frag_cache; extern void __page_frag_cache_drain(struct page *page, unsigned int count); extern void *page_frag_alloc(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void page_frag_free(void *addr); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) void page_alloc_init(void); void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(struct zone *zone); void drain_local_pages(struct zone *zone); void page_alloc_init_late(void); /* * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what * GFP flags are used before interrupts are enabled. Once interrupts are * enabled, it is set to __GFP_BITS_MASK while the system is running. During * hibernation, it is used by PM to avoid I/O during memory allocation while * devices are suspended. */ extern gfp_t gfp_allowed_mask; /* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); #ifdef CONFIG_PM_SLEEP extern bool pm_suspended_storage(void); #else static inline bool pm_suspended_storage(void) { return false; } #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_CONTIG_ALLOC /* The below functions must be run on a range from a single zone. */ extern int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype, gfp_t gfp_mask); extern struct page *alloc_contig_pages(unsigned long nr_pages, gfp_t gfp_mask, int nid, nodemask_t *nodemask); #endif void free_contig_range(unsigned long pfn, unsigned int nr_pages); #ifdef CONFIG_CMA /* CMA stuff */ extern void init_cma_reserved_pageblock(struct page *page); #endif #endif /* __LINUX_GFP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EMe(WB_REASON_FORKER_THREAD, "forker_thread") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_page_template, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_page_template, writeback_dirty_page, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping) ); DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct page *page, struct bdi_writeback *wb), TP_ARGS(page, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = page_mapping(page); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, bdi_setpoint) __field(unsigned long, bdi_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); __entry->bdi_dirty = bdi_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->bdi_setpoint, __entry->bdi_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed), TP_STRUCT__entry( __field( unsigned int, usec_timeout ) __field( unsigned int, usec_delayed ) ), TP_fast_assign( __entry->usec_timeout = usec_timeout; __entry->usec_delayed = usec_delayed; ), TP_printk("usec_timeout=%u usec_delayed=%u", __entry->usec_timeout, __entry->usec_delayed) ); DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed) ); DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright 2019 Google LLC */ #ifndef __LINUX_BLK_CRYPTO_H #define __LINUX_BLK_CRYPTO_H #include <linux/types.h> enum blk_crypto_mode_num { BLK_ENCRYPTION_MODE_INVALID, BLK_ENCRYPTION_MODE_AES_256_XTS, BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV, BLK_ENCRYPTION_MODE_ADIANTUM, BLK_ENCRYPTION_MODE_MAX, }; #define BLK_CRYPTO_MAX_KEY_SIZE 64 /** * struct blk_crypto_config - an inline encryption key's crypto configuration * @crypto_mode: encryption algorithm this key is for * @data_unit_size: the data unit size for all encryption/decryptions with this * key. This is the size in bytes of each individual plaintext and * ciphertext. This is always a power of 2. It might be e.g. the * filesystem block size or the disk sector size. * @dun_bytes: the maximum number of bytes of DUN used when using this key */ struct blk_crypto_config { enum blk_crypto_mode_num crypto_mode; unsigned int data_unit_size; unsigned int dun_bytes; }; /** * struct blk_crypto_key - an inline encryption key * @crypto_cfg: the crypto configuration (like crypto_mode, key size) for this * key * @data_unit_size_bits: log2 of data_unit_size * @size: size of this key in bytes (determined by @crypto_cfg.crypto_mode) * @raw: the raw bytes of this key. Only the first @size bytes are used. * * A blk_crypto_key is immutable once created, and many bios can reference it at * the same time. It must not be freed until all bios using it have completed * and it has been evicted from all devices on which it may have been used. */ struct blk_crypto_key { struct blk_crypto_config crypto_cfg; unsigned int data_unit_size_bits; unsigned int size; u8 raw[BLK_CRYPTO_MAX_KEY_SIZE]; }; #define BLK_CRYPTO_MAX_IV_SIZE 32 #define BLK_CRYPTO_DUN_ARRAY_SIZE (BLK_CRYPTO_MAX_IV_SIZE / sizeof(u64)) /** * struct bio_crypt_ctx - an inline encryption context * @bc_key: the key, algorithm, and data unit size to use * @bc_dun: the data unit number (starting IV) to use * * A bio_crypt_ctx specifies that the contents of the bio will be encrypted (for * write requests) or decrypted (for read requests) inline by the storage device * or controller, or by the crypto API fallback. */ struct bio_crypt_ctx { const struct blk_crypto_key *bc_key; u64 bc_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; }; #include <linux/blk_types.h> #include <linux/blkdev.h> struct request; struct request_queue; #ifdef CONFIG_BLK_INLINE_ENCRYPTION static inline bool bio_has_crypt_ctx(struct bio *bio) { return bio->bi_crypt_context; } void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask); bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, unsigned int bytes, const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]); int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, enum blk_crypto_mode_num crypto_mode, unsigned int dun_bytes, unsigned int data_unit_size); int blk_crypto_start_using_key(const struct blk_crypto_key *key, struct request_queue *q); int blk_crypto_evict_key(struct request_queue *q, const struct blk_crypto_key *key); bool blk_crypto_config_supported(struct request_queue *q, const struct blk_crypto_config *cfg); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool bio_has_crypt_ctx(struct bio *bio) { return false; } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask); /** * bio_crypt_clone - clone bio encryption context * @dst: destination bio * @src: source bio * @gfp_mask: memory allocation flags * * If @src has an encryption context, clone it to @dst. * * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. */ static inline int bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { if (bio_has_crypt_ctx(src)) return __bio_crypt_clone(dst, src, gfp_mask); return 0; } #endif /* __LINUX_BLK_CRYPTO_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _linux_POSIX_TIMERS_H #define _linux_POSIX_TIMERS_H #include <linux/spinlock.h> #include <linux/list.h> #include <linux/alarmtimer.h> #include <linux/timerqueue.h> #include <linux/task_work.h> struct kernel_siginfo; struct task_struct; /* * Bit fields within a clockid: * * The most significant 29 bits hold either a pid or a file descriptor. * * Bit 2 indicates whether a cpu clock refers to a thread or a process. * * Bits 1 and 0 give the type: PROF=0, VIRT=1, SCHED=2, or FD=3. * * A clockid is invalid if bits 2, 1, and 0 are all set. */ #define CPUCLOCK_PID(clock) ((pid_t) ~((clock) >> 3)) #define CPUCLOCK_PERTHREAD(clock) \ (((clock) & (clockid_t) CPUCLOCK_PERTHREAD_MASK) != 0) #define CPUCLOCK_PERTHREAD_MASK 4 #define CPUCLOCK_WHICH(clock) ((clock) & (clockid_t) CPUCLOCK_CLOCK_MASK) #define CPUCLOCK_CLOCK_MASK 3 #define CPUCLOCK_PROF 0 #define CPUCLOCK_VIRT 1 #define CPUCLOCK_SCHED 2 #define CPUCLOCK_MAX 3 #define CLOCKFD CPUCLOCK_MAX #define CLOCKFD_MASK (CPUCLOCK_PERTHREAD_MASK|CPUCLOCK_CLOCK_MASK) static inline clockid_t make_process_cpuclock(const unsigned int pid, const clockid_t clock) { return ((~pid) << 3) | clock; } static inline clockid_t make_thread_cpuclock(const unsigned int tid, const clockid_t clock) { return make_process_cpuclock(tid, clock | CPUCLOCK_PERTHREAD_MASK); } static inline clockid_t fd_to_clockid(const int fd) { return make_process_cpuclock((unsigned int) fd, CLOCKFD); } static inline int clockid_to_fd(const clockid_t clk) { return ~(clk >> 3); } #ifdef CONFIG_POSIX_TIMERS /** * cpu_timer - Posix CPU timer representation for k_itimer * @node: timerqueue node to queue in the task/sig * @head: timerqueue head on which this timer is queued * @task: Pointer to target task * @elist: List head for the expiry list * @firing: Timer is currently firing */ struct cpu_timer { struct timerqueue_node node; struct timerqueue_head *head; struct pid *pid; struct list_head elist; int firing; }; static inline bool cpu_timer_enqueue(struct timerqueue_head *head, struct cpu_timer *ctmr) { ctmr->head = head; return timerqueue_add(head, &ctmr->node); } static inline void cpu_timer_dequeue(struct cpu_timer *ctmr) { if (ctmr->head) { timerqueue_del(ctmr->head, &ctmr->node); ctmr->head = NULL; } } static inline u64 cpu_timer_getexpires(struct cpu_timer *ctmr) { return ctmr->node.expires; } static inline void cpu_timer_setexpires(struct cpu_timer *ctmr, u64 exp) { ctmr->node.expires = exp; } /** * posix_cputimer_base - Container per posix CPU clock * @nextevt: Earliest-expiration cache * @tqhead: timerqueue head for cpu_timers */ struct posix_cputimer_base { u64 nextevt; struct timerqueue_head tqhead; }; /** * posix_cputimers - Container for posix CPU timer related data * @bases: Base container for posix CPU clocks * @timers_active: Timers are queued. * @expiry_active: Timer expiry is active. Used for * process wide timers to avoid multiple * task trying to handle expiry concurrently * * Used in task_struct and signal_struct */ struct posix_cputimers { struct posix_cputimer_base bases[CPUCLOCK_MAX]; unsigned int timers_active; unsigned int expiry_active; }; /** * posix_cputimers_work - Container for task work based posix CPU timer expiry * @work: The task work to be scheduled * @scheduled: @work has been scheduled already, no further processing */ struct posix_cputimers_work { struct callback_head work; unsigned int scheduled; }; static inline void posix_cputimers_init(struct posix_cputimers *pct) { memset(pct, 0, sizeof(*pct)); pct->bases[0].nextevt = U64_MAX; pct->bases[1].nextevt = U64_MAX; pct->bases[2].nextevt = U64_MAX; } void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit); static inline void posix_cputimers_rt_watchdog(struct posix_cputimers *pct, u64 runtime) { pct->bases[CPUCLOCK_SCHED].nextevt = runtime; } /* Init task static initializer */ #define INIT_CPU_TIMERBASE(b) { \ .nextevt = U64_MAX, \ } #define INIT_CPU_TIMERBASES(b) { \ INIT_CPU_TIMERBASE(b[0]), \ INIT_CPU_TIMERBASE(b[1]), \ INIT_CPU_TIMERBASE(b[2]), \ } #define INIT_CPU_TIMERS(s) \ .posix_cputimers = { \ .bases = INIT_CPU_TIMERBASES(s.posix_cputimers.bases), \ }, #else struct posix_cputimers { }; struct cpu_timer { }; #define INIT_CPU_TIMERS(s) static inline void posix_cputimers_init(struct posix_cputimers *pct) { } static inline void posix_cputimers_group_init(struct posix_cputimers *pct, u64 cpu_limit) { } #endif #ifdef CONFIG_POSIX_CPU_TIMERS_TASK_WORK void posix_cputimers_init_work(void); #else static inline void posix_cputimers_init_work(void) { } #endif #define REQUEUE_PENDING 1 /** * struct k_itimer - POSIX.1b interval timer structure. * @list: List head for binding the timer to signals->posix_timers * @t_hash: Entry in the posix timer hash table * @it_lock: Lock protecting the timer * @kclock: Pointer to the k_clock struct handling this timer * @it_clock: The posix timer clock id * @it_id: The posix timer id for identifying the timer * @it_active: Marker that timer is active * @it_overrun: The overrun counter for pending signals * @it_overrun_last: The overrun at the time of the last delivered signal * @it_requeue_pending: Indicator that timer waits for being requeued on * signal delivery * @it_sigev_notify: The notify word of sigevent struct for signal delivery * @it_interval: The interval for periodic timers * @it_signal: Pointer to the creators signal struct * @it_pid: The pid of the process/task targeted by the signal * @it_process: The task to wakeup on clock_nanosleep (CPU timers) * @sigq: Pointer to preallocated sigqueue * @it: Union representing the various posix timer type * internals. * @rcu: RCU head for freeing the timer. */ struct k_itimer { struct list_head list; struct hlist_node t_hash; spinlock_t it_lock; const struct k_clock *kclock; clockid_t it_clock; timer_t it_id; int it_active; s64 it_overrun; s64 it_overrun_last; int it_requeue_pending; int it_sigev_notify; ktime_t it_interval; struct signal_struct *it_signal; union { struct pid *it_pid; struct task_struct *it_process; }; struct sigqueue *sigq; union { struct { struct hrtimer timer; } real; struct cpu_timer cpu; struct { struct alarm alarmtimer; } alarm; } it; struct rcu_head rcu; }; void run_posix_cpu_timers(void); void posix_cpu_timers_exit(struct task_struct *task); void posix_cpu_timers_exit_group(struct task_struct *task); void set_process_cpu_timer(struct task_struct *task, unsigned int clock_idx, u64 *newval, u64 *oldval); void update_rlimit_cpu(struct task_struct *task, unsigned long rlim_new); void posixtimer_rearm(struct kernel_siginfo *info); #endif
5 5 5 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 #ifndef _LINUX_JHASH_H #define _LINUX_JHASH_H /* jhash.h: Jenkins hash support. * * Copyright (C) 2006. Bob Jenkins (bob_jenkins@burtleburtle.net) * * https://burtleburtle.net/bob/hash/ * * These are the credits from Bob's sources: * * lookup3.c, by Bob Jenkins, May 2006, Public Domain. * * These are functions for producing 32-bit hashes for hash table lookup. * hashword(), hashlittle(), hashlittle2(), hashbig(), mix(), and final() * are externally useful functions. Routines to test the hash are included * if SELF_TEST is defined. You can use this free for any purpose. It's in * the public domain. It has no warranty. * * Copyright (C) 2009-2010 Jozsef Kadlecsik (kadlec@netfilter.org) * * I've modified Bob's hash to be useful in the Linux kernel, and * any bugs present are my fault. * Jozsef */ #include <linux/bitops.h> #include <linux/unaligned/packed_struct.h> /* Best hash sizes are of power of two */ #define jhash_size(n) ((u32)1<<(n)) /* Mask the hash value, i.e (value & jhash_mask(n)) instead of (value % n) */ #define jhash_mask(n) (jhash_size(n)-1) /* __jhash_mix -- mix 3 32-bit values reversibly. */ #define __jhash_mix(a, b, c) \ { \ a -= c; a ^= rol32(c, 4); c += b; \ b -= a; b ^= rol32(a, 6); a += c; \ c -= b; c ^= rol32(b, 8); b += a; \ a -= c; a ^= rol32(c, 16); c += b; \ b -= a; b ^= rol32(a, 19); a += c; \ c -= b; c ^= rol32(b, 4); b += a; \ } /* __jhash_final - final mixing of 3 32-bit values (a,b,c) into c */ #define __jhash_final(a, b, c) \ { \ c ^= b; c -= rol32(b, 14); \ a ^= c; a -= rol32(c, 11); \ b ^= a; b -= rol32(a, 25); \ c ^= b; c -= rol32(b, 16); \ a ^= c; a -= rol32(c, 4); \ b ^= a; b -= rol32(a, 14); \ c ^= b; c -= rol32(b, 24); \ } /* An arbitrary initial parameter */ #define JHASH_INITVAL 0xdeadbeef /* jhash - hash an arbitrary key * @k: sequence of bytes as key * @length: the length of the key * @initval: the previous hash, or an arbitray value * * The generic version, hashes an arbitrary sequence of bytes. * No alignment or length assumptions are made about the input key. * * Returns the hash value of the key. The result depends on endianness. */ static inline u32 jhash(const void *key, u32 length, u32 initval) { u32 a, b, c; const u8 *k = key; /* Set up the internal state */ a = b = c = JHASH_INITVAL + length + initval; /* All but the last block: affect some 32 bits of (a,b,c) */ while (length > 12) { a += __get_unaligned_cpu32(k); b += __get_unaligned_cpu32(k + 4); c += __get_unaligned_cpu32(k + 8); __jhash_mix(a, b, c); length -= 12; k += 12; } /* Last block: affect all 32 bits of (c) */ switch (length) { case 12: c += (u32)k[11]<<24; fallthrough; case 11: c += (u32)k[10]<<16; fallthrough; case 10: c += (u32)k[9]<<8; fallthrough; case 9: c += k[8]; fallthrough; case 8: b += (u32)k[7]<<24; fallthrough; case 7: b += (u32)k[6]<<16; fallthrough; case 6: b += (u32)k[5]<<8; fallthrough; case 5: b += k[4]; fallthrough; case 4: a += (u32)k[3]<<24; fallthrough; case 3: a += (u32)k[2]<<16; fallthrough; case 2: a += (u32)k[1]<<8; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* jhash2 - hash an array of u32's * @k: the key which must be an array of u32's * @length: the number of u32's in the key * @initval: the previous hash, or an arbitray value * * Returns the hash value of the key. */ static inline u32 jhash2(const u32 *k, u32 length, u32 initval) { u32 a, b, c; /* Set up the internal state */ a = b = c = JHASH_INITVAL + (length<<2) + initval; /* Handle most of the key */ while (length > 3) { a += k[0]; b += k[1]; c += k[2]; __jhash_mix(a, b, c); length -= 3; k += 3; } /* Handle the last 3 u32's */ switch (length) { case 3: c += k[2]; fallthrough; case 2: b += k[1]; fallthrough; case 1: a += k[0]; __jhash_final(a, b, c); break; case 0: /* Nothing left to add */ break; } return c; } /* __jhash_nwords - hash exactly 3, 2 or 1 word(s) */ static inline u32 __jhash_nwords(u32 a, u32 b, u32 c, u32 initval) { a += initval; b += initval; c += initval; __jhash_final(a, b, c); return c; } static inline u32 jhash_3words(u32 a, u32 b, u32 c, u32 initval) { return __jhash_nwords(a, b, c, initval + JHASH_INITVAL + (3 << 2)); } static inline u32 jhash_2words(u32 a, u32 b, u32 initval) { return __jhash_nwords(a, b, 0, initval + JHASH_INITVAL + (2 << 2)); } static inline u32 jhash_1word(u32 a, u32 initval) { return __jhash_nwords(a, 0, 0, initval + JHASH_INITVAL + (1 << 2)); } #endif /* _LINUX_JHASH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Queued spinlock * * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * * Authors: Waiman Long <waiman.long@hpe.com> */ #ifndef __ASM_GENERIC_QSPINLOCK_H #define __ASM_GENERIC_QSPINLOCK_H #include <asm-generic/qspinlock_types.h> #include <linux/atomic.h> #ifndef queued_spin_is_locked /** * queued_spin_is_locked - is the spinlock locked? * @lock: Pointer to queued spinlock structure * Return: 1 if it is locked, 0 otherwise */ static __always_inline int queued_spin_is_locked(struct qspinlock *lock) { /* * Any !0 state indicates it is locked, even if _Q_LOCKED_VAL * isn't immediately observable. */ return atomic_read(&lock->val); } #endif /** * queued_spin_value_unlocked - is the spinlock structure unlocked? * @lock: queued spinlock structure * Return: 1 if it is unlocked, 0 otherwise * * N.B. Whenever there are tasks waiting for the lock, it is considered * locked wrt the lockref code to avoid lock stealing by the lockref * code and change things underneath the lock. This also allows some * optimizations to be applied without conflict with lockref. */ static __always_inline int queued_spin_value_unlocked(struct qspinlock lock) { return !atomic_read(&lock.val); } /** * queued_spin_is_contended - check if the lock is contended * @lock : Pointer to queued spinlock structure * Return: 1 if lock contended, 0 otherwise */ static __always_inline int queued_spin_is_contended(struct qspinlock *lock) { return atomic_read(&lock->val) & ~_Q_LOCKED_MASK; } /** * queued_spin_trylock - try to acquire the queued spinlock * @lock : Pointer to queued spinlock structure * Return: 1 if lock acquired, 0 if failed */ static __always_inline int queued_spin_trylock(struct qspinlock *lock) { u32 val = atomic_read(&lock->val); if (unlikely(val)) return 0; return likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL)); } extern void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); #ifndef queued_spin_lock /** * queued_spin_lock - acquire a queued spinlock * @lock: Pointer to queued spinlock structure */ static __always_inline void queued_spin_lock(struct qspinlock *lock) { u32 val = 0; if (likely(atomic_try_cmpxchg_acquire(&lock->val, &val, _Q_LOCKED_VAL))) return; queued_spin_lock_slowpath(lock, val); } #endif #ifndef queued_spin_unlock /** * queued_spin_unlock - release a queued spinlock * @lock : Pointer to queued spinlock structure */ static __always_inline void queued_spin_unlock(struct qspinlock *lock) { /* * unlock() needs release semantics: */ smp_store_release(&lock->locked, 0); } #endif #ifndef virt_spin_lock static __always_inline bool virt_spin_lock(struct qspinlock *lock) { return false; } #endif /* * Remapping spinlock architecture specific functions to the corresponding * queued spinlock functions. */ #define arch_spin_is_locked(l) queued_spin_is_locked(l) #define arch_spin_is_contended(l) queued_spin_is_contended(l) #define arch_spin_value_unlocked(l) queued_spin_value_unlocked(l) #define arch_spin_lock(l) queued_spin_lock(l) #define arch_spin_trylock(l) queued_spin_trylock(l) #define arch_spin_unlock(l) queued_spin_unlock(l) #endif /* __ASM_GENERIC_QSPINLOCK_H */
5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 // SPDX-License-Identifier: GPL-2.0 // Generated by scripts/atomic/gen-atomic-fallback.sh // DO NOT MODIFY THIS FILE DIRECTLY #ifndef _LINUX_ATOMIC_FALLBACK_H #define _LINUX_ATOMIC_FALLBACK_H #include <linux/compiler.h>