1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ioctl.c * * Copyright (C) 1991, 1992 Linus Torvalds */ #include <linux/syscalls.h> #include <linux/mm.h> #include <linux/capability.h> #include <linux/compat.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/security.h> #include <linux/export.h> #include <linux/uaccess.h> #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> #include <linux/sched/signal.h> #include <linux/fiemap.h> #include "internal.h" #include <asm/ioctls.h> /* So that the fiemap access checks can't overflow on 32 bit machines. */ #define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on * @cmd: ioctl command to execute * @arg: command-specific argument for ioctl * * Invokes filesystem specific ->unlocked_ioctl, if one exists; otherwise * returns -ENOTTY. * * Returns 0 on success, -errno on error. */ long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; if (!filp->f_op->unlocked_ioctl) goto out; error = filp->f_op->unlocked_ioctl(filp, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; out: return error; } EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { struct inode *inode = file_inode(filp); struct super_block *sb = inode->i_sb; int error, ur_block; sector_t block; if (!capable(CAP_SYS_RAWIO)) return -EPERM; error = get_user(ur_block, p); if (error) return error; if (ur_block < 0) return -EINVAL; block = ur_block; error = bmap(inode, &block); if (block > INT_MAX) { error = -ERANGE; pr_warn_ratelimited("[%s/%d] FS: %s File: %pD4 would truncate fibmap result\n", current->comm, task_pid_nr(current), sb->s_id, filp); } if (error) ur_block = 0; else ur_block = block; if (put_user(ur_block, p)) error = -EFAULT; return error; } /** * fiemap_fill_next_extent - Fiemap helper function * @fieinfo: Fiemap context passed into ->fiemap * @logical: Extent logical start offset, in bytes * @phys: Extent physical start offset, in bytes * @len: Extent length, in bytes * @flags: FIEMAP_EXTENT flags that describe this extent * * Called from file system ->fiemap callback. Will populate extent * info as passed in via arguments and copy to user memory. On * success, extent count on fieinfo is incremented. * * Returns 0 on success, -errno on error, 1 if this was the last * extent that will fit in user array. */ #define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) #define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) #define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, u64 phys, u64 len, u32 flags) { struct fiemap_extent extent; struct fiemap_extent __user *dest = fieinfo->fi_extents_start; /* only count the extents */ if (fieinfo->fi_extents_max == 0) { fieinfo->fi_extents_mapped++; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) return 1; if (flags & SET_UNKNOWN_FLAGS) flags |= FIEMAP_EXTENT_UNKNOWN; if (flags & SET_NO_UNMOUNTED_IO_FLAGS) flags |= FIEMAP_EXTENT_ENCODED; if (flags & SET_NOT_ALIGNED_FLAGS) flags |= FIEMAP_EXTENT_NOT_ALIGNED; memset(&extent, 0, sizeof(extent)); extent.fe_logical = logical; extent.fe_physical = phys; extent.fe_length = len; extent.fe_flags = flags; dest += fieinfo->fi_extents_mapped; if (copy_to_user(dest, &extent, sizeof(extent))) return -EFAULT; fieinfo->fi_extents_mapped++; if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) return 1; return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; } EXPORT_SYMBOL(fiemap_fill_next_extent); /** * fiemap_prep - check validity of requested flags for fiemap * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap * @start: Start of the mapped range * @len: Length of the mapped range, can be truncated by this function. * @supported_flags: Set of fiemap flags that the file system understands * * This function must be called from each ->fiemap instance to validate the * fiemap request against the file system parameters. * * Returns 0 on success, or a negative error on failure. */ int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 *len, u32 supported_flags) { u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; int ret = 0; if (*len == 0) return -EINVAL; if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ if (*len > maxbytes || (maxbytes - *len) < start) *len = maxbytes - start; supported_flags |= FIEMAP_FLAG_SYNC; supported_flags &= FIEMAP_FLAGS_COMPAT; incompat_flags = fieinfo->fi_flags & ~supported_flags; if (incompat_flags) { fieinfo->fi_flags = incompat_flags; return -EBADR; } if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) ret = filemap_write_and_wait(inode->i_mapping); return ret; } EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); int error; if (!inode->i_op->fiemap) return -EOPNOTSUPP; if (copy_from_user(&fiemap, ufiemap, sizeof(fiemap))) return -EFAULT; if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) error = -EFAULT; return error; } static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { struct fd src_file = fdget(srcfd); loff_t cloned; int ret; if (!src_file.file) return -EBADF; ret = -EXDEV; if (src_file.file->f_path.mnt != dst_file->f_path.mnt) goto fdput; cloned = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen, 0); if (cloned < 0) ret = cloned; else if (olen && cloned != olen) ret = -EINVAL; else ret = 0; fdput: fdput(src_file); return ret; } static long ioctl_file_clone_range(struct file *file, struct file_clone_range __user *argp) { struct file_clone_range args; if (copy_from_user(&args, argp, sizeof(args))) return -EFAULT; return ioctl_file_clone(file, args.src_fd, args.src_offset, args.src_length, args.dest_offset); } #ifdef CONFIG_BLOCK static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) { return (offset >> inode->i_blkbits); } static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) { return (blk << inode->i_blkbits); } /** * __generic_block_fiemap - FIEMAP for block based inodes (no locking) * @inode: the inode to map * @fieinfo: the fiemap info struct that will be passed back to userspace * @start: where to start mapping in the inode * @len: how much space to map * @get_block: the fs's get_block function * * This does FIEMAP for block based inodes. Basically it will just loop * through get_block until we hit the number of extents we want to map, or we * go past the end of the file and hit a hole. * * If it is possible to have data blocks beyond a hole past @inode->i_size, then * please do not use this function, it will stop at the first unmapped block * beyond i_size. * * If you use this function directly, you need to do your own locking. Use * generic_block_fiemap if you want the locking done for you. */ static int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, get_block_t *get_block) { struct buffer_head map_bh; sector_t start_blk, last_blk; loff_t isize = i_size_read(inode); u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; /* * Either the i_mutex or other appropriate locking needs to be held * since we expect isize to not change at all through the duration of * this call. */ if (len >= isize) { whole_file = true; len = isize; } /* * Some filesystems can't deal with being asked to map less than * blocksize, so make sure our len is at least block length. */ if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_block(inode, start_blk, &map_bh, 0); if (ret) break; /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk++; /* * We want to handle the case where there is an * allocated block at the front of the file, and then * nothing but holes up to the end of the file properly, * to make sure that extent at the front gets properly * marked with FIEMAP_EXTENT_LAST */ if (!past_eof && blk_to_logical(inode, start_blk) >= isize) past_eof = 1; /* * First hole after going past the EOF, this is our * last extent */ if (past_eof && size) { flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); } else if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); size = 0; } /* if we have holes up to/past EOF then we're done */ if (start_blk > last_blk || past_eof || ret) break; } else { /* * We have gone over the length of what we wanted to * map, and it wasn't the entire file, so add the extent * we got last time and exit. * * This is for the case where say we want to map all the * way up to the second to the last block in a file, but * the last block is a hole, making the second to last * block FIEMAP_EXTENT_LAST. In this case we want to * see if there is a hole after the second to last block * so we can mark it properly. If we found data after * we exceeded the length we were requesting, then we * are good to go, just add the extent to the fieinfo * and break */ if (start_blk > last_blk && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); break; } /* * if size != 0 then we know we already have an extent * to add, so add it. */ if (size) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); if (ret) break; } logical = blk_to_logical(inode, start_blk); phys = blk_to_logical(inode, map_bh.b_blocknr); size = map_bh.b_size; flags = FIEMAP_EXTENT_MERGED; start_blk += logical_to_blk(inode, size); /* * If we are past the EOF, then we need to make sure as * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ if (!past_eof && logical + size >= isize) past_eof = true; } cond_resched(); if (fatal_signal_pending(current)) { ret = -EINTR; break; } } while (1); /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; return ret; } /** * generic_block_fiemap - FIEMAP for block based inodes * @inode: The inode to map * @fieinfo: The mapping information * @start: The initial block to map * @len: The length of the extect to attempt to map * @get_block: The block mapping function for the fs * * Calls __generic_block_fiemap to map the inode, after taking * the inode's mutex lock. */ int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block) { int ret; inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); #endif /* CONFIG_BLOCK */ /* * This provides compatibility with legacy XFS pre-allocation ioctls * which predate the fallocate syscall. * * Only the l_start, l_len and l_whence fields of the 'struct space_resv' * are used here, rest are ignored. */ static int ioctl_preallocate(struct file *filp, int mode, void __user *argp) { struct inode *inode = file_inode(filp); struct space_resv sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += filp->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(filp, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } /* on ia32 l_start is on a 32-bit boundary */ #if defined CONFIG_COMPAT && defined(CONFIG_X86_64) /* just account for different alignment */ static int compat_ioctl_preallocate(struct file *file, int mode, struct space_resv_32 __user *argp) { struct inode *inode = file_inode(file); struct space_resv_32 sr; if (copy_from_user(&sr, argp, sizeof(sr))) return -EFAULT; switch (sr.l_whence) { case SEEK_SET: break; case SEEK_CUR: sr.l_start += file->f_pos; break; case SEEK_END: sr.l_start += i_size_read(inode); break; default: return -EINVAL; } return vfs_fallocate(file, mode | FALLOC_FL_KEEP_SIZE, sr.l_start, sr.l_len); } #endif static int file_ioctl(struct file *filp, unsigned int cmd, int __user *p) { switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); case FS_IOC_RESVSP: case FS_IOC_RESVSP64: return ioctl_preallocate(filp, 0, p); case FS_IOC_UNRESVSP: case FS_IOC_UNRESVSP64: return ioctl_preallocate(filp, FALLOC_FL_PUNCH_HOLE, p); case FS_IOC_ZERO_RANGE: return ioctl_preallocate(filp, FALLOC_FL_ZERO_RANGE, p); } return -ENOIOCTLCMD; } static int ioctl_fionbio(struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = O_NONBLOCK; #ifdef __sparc__ /* SunOS compatibility item. */ if (O_NONBLOCK != O_NDELAY) flag |= O_NDELAY; #endif spin_lock(&filp->f_lock); if (on) filp->f_flags |= flag; else filp->f_flags &= ~flag; spin_unlock(&filp->f_lock); return error; } static int ioctl_fioasync(unsigned int fd, struct file *filp, int __user *argp) { unsigned int flag; int on, error; error = get_user(on, argp); if (error) return error; flag = on ? FASYNC : 0; /* Did FASYNC state change ? */ if ((flag ^ filp->f_flags) & FASYNC) { if (filp->f_op->fasync) /* fasync() adjusts filp->f_flags */ error = filp->f_op->fasync(fd, filp, on); else error = -ENOTTY; } return error < 0 ? error : 0; } static int ioctl_fsfreeze(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* If filesystem doesn't support freeze feature, return. */ if (sb->s_op->freeze_fs == NULL && sb->s_op->freeze_super == NULL) return -EOPNOTSUPP; /* Freeze */ if (sb->s_op->freeze_super) return sb->s_op->freeze_super(sb); return freeze_super(sb); } static int ioctl_fsthaw(struct file *filp) { struct super_block *sb = file_inode(filp)->i_sb; if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; /* Thaw */ if (sb->s_op->thaw_super) return sb->s_op->thaw_super(sb); return thaw_super(sb); } static int ioctl_file_dedupe_range(struct file *file, struct file_dedupe_range __user *argp) { struct file_dedupe_range *same = NULL; int ret; unsigned long size; u16 count; if (get_user(count, &argp->dest_count)) { ret = -EFAULT; goto out; } size = offsetof(struct file_dedupe_range __user, info[count]); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; } same = memdup_user(argp, size); if (IS_ERR(same)) { ret = PTR_ERR(same); same = NULL; goto out; } same->dest_count = count; ret = vfs_dedupe_file_range(file, same); if (ret) goto out; ret = copy_to_user(argp, same, size); if (ret) ret = -EFAULT; out: kfree(same); return ret; } /* * do_vfs_ioctl() is not for drivers and not intended to be EXPORT_SYMBOL()'d. * It's just a simple helper for sys_ioctl and compat_sys_ioctl. * * When you add any new common ioctls to the switches above and below, * please ensure they have compatible arguments in compat mode. */ static int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct inode *inode = file_inode(filp); switch (cmd) { case FIOCLEX: set_close_on_exec(fd, 1); return 0; case FIONCLEX: set_close_on_exec(fd, 0); return 0; case FIONBIO: return ioctl_fionbio(filp, argp); case FIOASYNC: return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); return copy_to_user(argp, &res, sizeof(res)) ? -EFAULT : 0; } return -ENOTTY; case FIFREEZE: return ioctl_fsfreeze(filp); case FITHAW: return ioctl_fsthaw(filp); case FS_IOC_FIEMAP: return ioctl_fiemap(filp, argp); case FIGETBSZ: /* anon_bdev filesystems may not have a block size */ if (!inode->i_sb->s_blocksize) return -EINVAL; return put_user(inode->i_sb->s_blocksize, (int __user *)argp); case FICLONE: return ioctl_file_clone(filp, arg, 0, 0, 0); case FICLONERANGE: return ioctl_file_clone_range(filp, argp); case FIDEDUPERANGE: return ioctl_file_dedupe_range(filp, argp); case FIONREAD: if (!S_ISREG(inode->i_mode)) return vfs_ioctl(filp, cmd, arg); return put_user(i_size_read(inode) - filp->f_pos, (int __user *)argp); default: if (S_ISREG(inode->i_mode)) return file_ioctl(filp, cmd, argp); break; } return -ENOIOCTLCMD; } SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, unsigned long, arg) { struct fd f = fdget(fd); int error; if (!f.file) return -EBADF; error = security_file_ioctl(f.file, cmd, arg); if (error) goto out; error = do_vfs_ioctl(f.file, fd, cmd, arg); if (error == -ENOIOCTLCMD) error = vfs_ioctl(f.file, cmd, arg); out: fdput(f); return error; } #ifdef CONFIG_COMPAT /** * compat_ptr_ioctl - generic implementation of .compat_ioctl file operation * * This is not normally called as a function, but instead set in struct * file_operations as * * .compat_ioctl = compat_ptr_ioctl, * * On most architectures, the compat_ptr_ioctl() just passes all arguments * to the corresponding ->ioctl handler. The exception is arch/s390, where * compat_ptr() clears the top bit of a 32-bit pointer value, so user space * pointers to the second 2GB alias the first 2GB, as is the case for * native 32-bit s390 user space. * * The compat_ptr_ioctl() function must therefore be used only with ioctl * functions that either ignore the argument or pass a pointer to a * compatible data type. * * If any ioctl command handled by fops->unlocked_ioctl passes a plain * integer instead of a pointer, or any of the passed data types * is incompatible between 32-bit and 64-bit architectures, a proper * handler is required instead of compat_ptr_ioctl. */ long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { if (!file->f_op->unlocked_ioctl) return -ENOIOCTLCMD; return file->f_op->unlocked_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } EXPORT_SYMBOL(compat_ptr_ioctl); COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, compat_ulong_t, arg) { struct fd f = fdget(fd); int error; if (!f.file) return -EBADF; /* RED-PEN how should LSM module know it's handling 32bit? */ error = security_file_ioctl(f.file, cmd, arg); if (error) goto out; switch (cmd) { /* FICLONE takes an int argument, so don't use compat_ptr() */ case FICLONE: error = ioctl_file_clone(f.file, arg, 0, 0, 0); break; #if defined(CONFIG_X86_64) /* these get messy on amd64 due to alignment differences */ case FS_IOC_RESVSP_32: case FS_IOC_RESVSP64_32: error = compat_ioctl_preallocate(f.file, 0, compat_ptr(arg)); break; case FS_IOC_UNRESVSP_32: case FS_IOC_UNRESVSP64_32: error = compat_ioctl_preallocate(f.file, FALLOC_FL_PUNCH_HOLE, compat_ptr(arg)); break; case FS_IOC_ZERO_RANGE_32: error = compat_ioctl_preallocate(f.file, FALLOC_FL_ZERO_RANGE, compat_ptr(arg)); break; #endif /* * everything else in do_vfs_ioctl() takes either a compatible * pointer argument or no argument -- call it with a modified * argument. */ default: error = do_vfs_ioctl(f.file, fd, cmd, (unsigned long)compat_ptr(arg)); if (error != -ENOIOCTLCMD) break; if (f.file->f_op->compat_ioctl) error = f.file->f_op->compat_ioctl(f.file, cmd, arg); if (error == -ENOIOCTLCMD) error = -ENOTTY; break; } out: fdput(f); return error; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2009-2019 Christoph Hellwig * * NOTE: none of these tracepoints shall be consider a stable kernel ABI * as they can change at any time. */ #undef TRACE_SYSTEM #define TRACE_SYSTEM iomap #if !defined(_IOMAP_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) #define _IOMAP_TRACE_H #include <linux/tracepoint.h> struct inode; DECLARE_EVENT_CLASS(iomap_readpage_class, TP_PROTO(struct inode *inode, int nr_pages), TP_ARGS(inode, nr_pages), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(int, nr_pages) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_pages = nr_pages; ), TP_printk("dev %d:%d ino 0x%llx nr_pages %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->nr_pages) ) #define DEFINE_READPAGE_EVENT(name) \ DEFINE_EVENT(iomap_readpage_class, name, \ TP_PROTO(struct inode *inode, int nr_pages), \ TP_ARGS(inode, nr_pages)) DEFINE_READPAGE_EVENT(iomap_readpage); DEFINE_READPAGE_EVENT(iomap_readahead); DECLARE_EVENT_CLASS(iomap_range_class, TP_PROTO(struct inode *inode, unsigned long off, unsigned int len), TP_ARGS(inode, off, len), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, size) __field(unsigned long, offset) __field(unsigned int, length) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->size = i_size_read(inode); __entry->offset = off; __entry->length = len; ), TP_printk("dev %d:%d ino 0x%llx size 0x%llx offset %lx " "length %x", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->size, __entry->offset, __entry->length) ) #define DEFINE_RANGE_EVENT(name) \ DEFINE_EVENT(iomap_range_class, name, \ TP_PROTO(struct inode *inode, unsigned long off, unsigned int len),\ TP_ARGS(inode, off, len)) DEFINE_RANGE_EVENT(iomap_writepage); DEFINE_RANGE_EVENT(iomap_releasepage); DEFINE_RANGE_EVENT(iomap_invalidatepage); DEFINE_RANGE_EVENT(iomap_dio_invalidate_fail); #define IOMAP_TYPE_STRINGS \ { IOMAP_HOLE, "HOLE" }, \ { IOMAP_DELALLOC, "DELALLOC" }, \ { IOMAP_MAPPED, "MAPPED" }, \ { IOMAP_UNWRITTEN, "UNWRITTEN" }, \ { IOMAP_INLINE, "INLINE" } #define IOMAP_FLAGS_STRINGS \ { IOMAP_WRITE, "WRITE" }, \ { IOMAP_ZERO, "ZERO" }, \ { IOMAP_REPORT, "REPORT" }, \ { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ { IOMAP_F_DIRTY, "DIRTY" }, \ { IOMAP_F_SHARED, "SHARED" }, \ { IOMAP_F_MERGED, "MERGED" }, \ { IOMAP_F_BUFFER_HEAD, "BH" }, \ { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } DECLARE_EVENT_CLASS(iomap_class, TP_PROTO(struct inode *inode, struct iomap *iomap), TP_ARGS(inode, iomap), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(u64, addr) __field(loff_t, offset) __field(u64, length) __field(u16, type) __field(u16, flags) __field(dev_t, bdev) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->addr = iomap->addr; __entry->offset = iomap->offset; __entry->length = iomap->length; __entry->type = iomap->type; __entry->flags = iomap->flags; __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr %lld offset %lld " "length %llu type %s flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), __entry->addr, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) ) #define DEFINE_IOMAP_EVENT(name) \ DEFINE_EVENT(iomap_class, name, \ TP_PROTO(struct inode *inode, struct iomap *iomap), \ TP_ARGS(inode, iomap)) DEFINE_IOMAP_EVENT(iomap_apply_dstmap); DEFINE_IOMAP_EVENT(iomap_apply_srcmap); TRACE_EVENT(iomap_apply, TP_PROTO(struct inode *inode, loff_t pos, loff_t length, unsigned int flags, const void *ops, void *actor, unsigned long caller), TP_ARGS(inode, pos, length, flags, ops, actor, caller), TP_STRUCT__entry( __field(dev_t, dev) __field(u64, ino) __field(loff_t, pos) __field(loff_t, length) __field(unsigned int, flags) __field(const void *, ops) __field(void *, actor) __field(unsigned long, caller) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->length = length; __entry->flags = flags; __entry->ops = ops; __entry->actor = actor; __entry->caller = caller; ), TP_printk("dev %d:%d ino 0x%llx pos %lld length %lld flags %s (0x%x) " "ops %ps caller %pS actor %ps", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, (void *)__entry->caller, __entry->actor) ); #endif /* _IOMAP_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE trace #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Fast and scalable bitmaps. * * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #ifndef __LINUX_SCALE_BITMAP_H #define __LINUX_SCALE_BITMAP_H #include <linux/kernel.h> #include <linux/slab.h> struct seq_file; /** * struct sbitmap_word - Word in a &struct sbitmap. */ struct sbitmap_word { /** * @depth: Number of bits being used in @word/@cleared */ unsigned long depth; /** * @word: word holding free bits */ unsigned long word ____cacheline_aligned_in_smp; /** * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; /** * @swap_lock: Held while swapping word <-> cleared */ spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** * struct sbitmap - Scalable bitmap. * * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This * trades off higher memory usage for better scalability. */ struct sbitmap { /** * @depth: Number of bits used in the whole bitmap. */ unsigned int depth; /** * @shift: log2(number of bits used per word) */ unsigned int shift; /** * @map_nr: Number of words (cachelines) being used for the bitmap. */ unsigned int map_nr; /** * @map: Allocated bitmap. */ struct sbitmap_word *map; }; #define SBQ_WAIT_QUEUES 8 #define SBQ_WAKE_BATCH 8 /** * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { /** * @wait_cnt: Number of frees remaining before we wake up. */ atomic_t wait_cnt; /** * @wait: Wait queue. */ wait_queue_head_t wait; } ____cacheline_aligned_in_smp; /** * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free * bits. * * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to * avoid contention on the wait queue spinlock. This ensures that we don't hit a * scalability wall when we run out of free bits and have to start putting tasks * to sleep. */ struct sbitmap_queue { /** * @sb: Scalable bitmap. */ struct sbitmap sb; /* * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different * cachelines until the map is exhausted. */ unsigned int __percpu *alloc_hint; /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. */ unsigned int wake_batch; /** * @wake_index: Next wait queue in @ws to wake up. */ atomic_t wake_index; /** * @ws: Wait queues. */ struct sbq_wait_state *ws; /* * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; /** * @round_robin: Allocate bits in strict round-robin order. */ bool round_robin; /** * @min_shallow_depth: The minimum shallow depth which may be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). */ unsigned int min_shallow_depth; }; /** * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. * @sb: Bitmap to initialize. * @depth: Number of bits to allocate. * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if * given, a good default is chosen. * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node); /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. */ static inline void sbitmap_free(struct sbitmap *sb) { kfree(sb->map); sb->map = NULL; } /** * sbitmap_resize() - Resize a &struct sbitmap. * @sb: Bitmap to resize. * @depth: New number of bits to resize to. * * Doesn't reallocate anything. It's up to the caller to ensure that the new * depth doesn't exceed the depth that the sb was initialized with. */ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); /** * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. * @sb: Bitmap to allocate from. * @alloc_hint: Hint for where to start searching for a free bit. * @round_robin: If true, be stricter about allocation order; always allocate * starting from the last allocated bit. This is less efficient * than the default behavior (false). * * This operation provides acquire barrier semantics if it succeeds. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); /** * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, * limiting the depth used from each word. * @sb: Bitmap to allocate from. * @alloc_hint: Hint for where to start searching for a free bit. * @shallow_depth: The maximum number of bits to allocate from a single word. * * This rather specific operation allows for having multiple users with * different allocation limits. E.g., there can be a high-priority class that * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority * class can only allocate half of the total bits in the bitmap, preventing it * from starving out the high-priority class. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth); /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. * * Return: true if any bit in the bitmap is set, false otherwise. */ bool sbitmap_any_bit_set(const struct sbitmap *sb); #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. * * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ static inline void __sbitmap_for_each_set(struct sbitmap *sb, unsigned int start, sb_for_each_fn fn, void *data) { unsigned int index; unsigned int nr; unsigned int scanned = 0; if (start >= sb->depth) start = 0; index = SB_NR_TO_INDEX(sb, start); nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, sb->map[index].depth - nr, sb->depth - scanned); scanned += depth; word = sb->map[index].word & ~sb->map[index].cleared; if (!word) goto next; /* * On the first iteration of the outer loop, we need to add the * bit offset back to the size of the word for find_next_bit(). * On all other iterations, nr is zero, so this is a noop. */ depth += nr; while (1) { nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } next: nr = 0; if (++index >= sb->map_nr) index = 0; } } /** * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. */ static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, void *data) { __sbitmap_for_each_set(sb, 0, fn, data); } static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) { return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; } /* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) { set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) { clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) { unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; set_bit(SB_NR_TO_BIT(sb, bitnr), addr); } static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb, unsigned int bitnr) { clear_bit_unlock(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) { return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct * seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The output isn't guaranteed to be internally * consistent. */ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. * * @sbq: Bitmap queue to free. */ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); free_percpu(sbq->alloc_hint); sbitmap_free(&sbq->sb); } /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. * @depth: New number of bits to resize to. * * Like sbitmap_resize(), this doesn't reallocate anything. It has to do * some extra work on the &struct sbitmap_queue, so it's not safe to just * resize the underlying &struct sbitmap. */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); /** * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from. * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } /** * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int *cpu, unsigned int shallow_depth) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); put_cpu(); return nr; } /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. * @sbq: Bitmap queue in question. * @min_shallow_depth: The minimum shallow depth that will be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). * * sbitmap_queue_clear() batches wakeups as an optimization. The batch size * depends on the depth of the bitmap. Since the shallow allocation functions * effectively operate with a different depth, the shallow depth must be taken * into account when calculating the batch size. This function must be called * with the minimum shallow depth that will be used. Failure to do so can result * in missed wakeups. */ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth); /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); } static inline void sbq_index_atomic_inc(atomic_t *index) { int old = atomic_read(index); int new = sbq_index_inc(old); atomic_cmpxchg(index, old, new); } /** * sbq_wait_ptr() - Get the next wait queue to use for a &struct * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) { struct sbq_wait_state *ws; ws = &sbq->ws[atomic_read(wait_index)]; sbq_index_atomic_inc(wait_index); return ws; } /** * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct * sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); /** * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct * seq_file. * @sbq: Bitmap queue to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); struct sbq_wait { struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ struct wait_queue_entry wait; }; #define DEFINE_SBQ_WAIT(name) \ struct sbq_wait name = { \ .sbq = NULL, \ .wait = { \ .private = current, \ .func = autoremove_wake_function, \ .entry = LIST_HEAD_INIT((name).wait.entry), \ } \ } /* * Wrapper around prepare_to_wait_exclusive(), which maintains some extra * internal state. */ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state); /* * Must be paired with sbitmap_prepare_to_wait(). */ void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Wrapper around add_wait_queue(), which maintains some extra internal state */ void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Must be paired with sbitmap_add_wait_queue() */ void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); #endif /* __LINUX_SCALE_BITMAP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BH_H #define _LINUX_BH_H #include <linux/preempt.h> #ifdef CONFIG_TRACE_IRQFLAGS extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { preempt_count_add(cnt); barrier(); } #endif static inline void local_bh_disable(void) { __local_bh_disable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } extern void _local_bh_enable(void); extern void __local_bh_enable_ip(unsigned long ip, unsigned int cnt); static inline void local_bh_enable_ip(unsigned long ip) { __local_bh_enable_ip(ip, SOFTIRQ_DISABLE_OFFSET); } static inline void local_bh_enable(void) { __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } #endif /* _LINUX_BH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 // SPDX-License-Identifier: GPL-2.0 /* * kobject.h - generic kernel object infrastructure. * * Copyright (c) 2002-2003 Patrick Mochel * Copyright (c) 2002-2003 Open Source Development Labs * Copyright (c) 2006-2008 Greg Kroah-Hartman <greg@kroah.com> * Copyright (c) 2006-2008 Novell Inc. * * Please read Documentation/core-api/kobject.rst before using the kobject * interface, ESPECIALLY the parts about reference counts and object * destructors. */ #ifndef _KOBJECT_H_ #define _KOBJECT_H_ #include <linux/types.h> #include <linux/list.h> #include <linux/sysfs.h> #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/kref.h> #include <linux/kobject_ns.h> #include <linux/kernel.h> #include <linux/wait.h> #include <linux/atomic.h> #include <linux/workqueue.h> #include <linux/uidgid.h> #define UEVENT_HELPER_PATH_LEN 256 #define UEVENT_NUM_ENVP 64 /* number of env pointers */ #define UEVENT_BUFFER_SIZE 2048 /* buffer for the variables */ #ifdef CONFIG_UEVENT_HELPER /* path to the userspace helper executed on an event */ extern char uevent_helper[]; #endif /* counter to tag the uevent, read only except for the kobject core */ extern u64 uevent_seqnum; /* * The actions here must match the index to the string array * in lib/kobject_uevent.c * * Do not add new actions here without checking with the driver-core * maintainers. Action strings are not meant to express subsystem * or device specific properties. In most cases you want to send a * kobject_uevent_env(kobj, KOBJ_CHANGE, env) with additional event * specific variables added to the event environment. */ enum kobject_action { KOBJ_ADD, KOBJ_REMOVE, KOBJ_CHANGE, KOBJ_MOVE, KOBJ_ONLINE, KOBJ_OFFLINE, KOBJ_BIND, KOBJ_UNBIND, }; struct kobject { const char *name; struct list_head entry; struct kobject *parent; struct kset *kset; struct kobj_type *ktype; struct kernfs_node *sd; /* sysfs directory entry */ struct kref kref; #ifdef CONFIG_DEBUG_KOBJECT_RELEASE struct delayed_work release; #endif unsigned int state_initialized:1; unsigned int state_in_sysfs:1; unsigned int state_add_uevent_sent:1; unsigned int state_remove_uevent_sent:1; unsigned int uevent_suppress:1; }; extern __printf(2, 3) int kobject_set_name(struct kobject *kobj, const char *name, ...); extern __printf(2, 0) int kobject_set_name_vargs(struct kobject *kobj, const char *fmt, va_list vargs); static inline const char *kobject_name(const struct kobject *kobj) { return kobj->name; } extern void kobject_init(struct kobject *kobj, struct kobj_type *ktype); extern __printf(3, 4) __must_check int kobject_add(struct kobject *kobj, struct kobject *parent, const char *fmt, ...); extern __printf(4, 5) __must_check int kobject_init_and_add(struct kobject *kobj, struct kobj_type *ktype, struct kobject *parent, const char *fmt, ...); extern void kobject_del(struct kobject *kobj); extern struct kobject * __must_check kobject_create(void); extern struct kobject * __must_check kobject_create_and_add(const char *name, struct kobject *parent); extern int __must_check kobject_rename(struct kobject *, const char *new_name); extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); extern struct kobject * __must_check kobject_get_unless_zero( struct kobject *kobj); extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); extern void kobject_get_ownership(struct kobject *kobj, kuid_t *uid, kgid_t *gid); extern char *kobject_get_path(struct kobject *kobj, gfp_t flag); /** * kobject_has_children - Returns whether a kobject has children. * @kobj: the object to test * * This will return whether a kobject has other kobjects as children. * * It does NOT account for the presence of attribute files, only sub * directories. It also assumes there is no concurrent addition or * removal of such children, and thus relies on external locking. */ static inline bool kobject_has_children(struct kobject *kobj) { WARN_ON_ONCE(kref_read(&kobj->kref) == 0); return kobj->sd && kobj->sd->dir.subdirs; } struct kobj_type { void (*release)(struct kobject *kobj); const struct sysfs_ops *sysfs_ops; struct attribute **default_attrs; /* use default_groups instead */ const struct attribute_group **default_groups; const struct kobj_ns_type_operations *(*child_ns_type)(struct kobject *kobj); const void *(*namespace)(struct kobject *kobj); void (*get_ownership)(struct kobject *kobj, kuid_t *uid, kgid_t *gid); }; struct kobj_uevent_env { char *argv[3]; char *envp[UEVENT_NUM_ENVP]; int envp_idx; char buf[UEVENT_BUFFER_SIZE]; int buflen; }; struct kset_uevent_ops { int (* const filter)(struct kset *kset, struct kobject *kobj); const char *(* const name)(struct kset *kset, struct kobject *kobj); int (* const uevent)(struct kset *kset, struct kobject *kobj, struct kobj_uevent_env *env); }; struct kobj_attribute { struct attribute attr; ssize_t (*show)(struct kobject *kobj, struct kobj_attribute *attr, char *buf); ssize_t (*store)(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count); }; extern const struct sysfs_ops kobj_sysfs_ops; struct sock; /** * struct kset - a set of kobjects of a specific type, belonging to a specific subsystem. * * A kset defines a group of kobjects. They can be individually * different "types" but overall these kobjects all want to be grouped * together and operated on in the same manner. ksets are used to * define the attribute callbacks and other common events that happen to * a kobject. * * @list: the list of all kobjects for this kset * @list_lock: a lock for iterating over the kobjects * @kobj: the embedded kobject for this kset (recursion, isn't it fun...) * @uevent_ops: the set of uevent operations for this kset. These are * called whenever a kobject has something happen to it so that the kset * can add new environment variables, or filter out the uevents if so * desired. */ struct kset { struct list_head list; spinlock_t list_lock; struct kobject kobj; const struct kset_uevent_ops *uevent_ops; } __randomize_layout; extern void kset_init(struct kset *kset); extern int __must_check kset_register(struct kset *kset); extern void kset_unregister(struct kset *kset); extern struct kset * __must_check kset_create_and_add(const char *name, const struct kset_uevent_ops *u, struct kobject *parent_kobj); static inline struct kset *to_kset(struct kobject *kobj) { return kobj ? container_of(kobj, struct kset, kobj) : NULL; } static inline struct kset *kset_get(struct kset *k) { return k ? to_kset(kobject_get(&k->kobj)) : NULL; } static inline void kset_put(struct kset *k) { kobject_put(&k->kobj); } static inline struct kobj_type *get_ktype(struct kobject *kobj) { return kobj->ktype; } extern struct kobject *kset_find_obj(struct kset *, const char *); /* The global /sys/kernel/ kobject for people to chain off of */ extern struct kobject *kernel_kobj; /* The global /sys/kernel/mm/ kobject for people to chain off of */ extern struct kobject *mm_kobj; /* The global /sys/hypervisor/ kobject for people to chain off of */ extern struct kobject *hypervisor_kobj; /* The global /sys/power/ kobject for people to chain off of */ extern struct kobject *power_kobj; /* The global /sys/firmware/ kobject for people to chain off of */ extern struct kobject *firmware_kobj; int kobject_uevent(struct kobject *kobj, enum kobject_action action); int kobject_uevent_env(struct kobject *kobj, enum kobject_action action, char *envp[]); int kobject_synth_uevent(struct kobject *kobj, const char *buf, size_t count); __printf(2, 3) int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...); #endif /* _KOBJECT_H_ */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/namespace.c * * (C) Copyright Al Viro 2000, 2001 * * Based on code from fs/super.c, copyright Linus Torvalds and others. * Heavily rewritten. */ #include <linux/syscalls.h> #include <linux/export.h> #include <linux/capability.h> #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> #include <linux/init.h> /* init_rootfs */ #include <linux/fs_struct.h> /* get_fs_root et.al. */ #include <linux/fsnotify.h> /* fsnotify_vfsmount_delete */ #include <linux/file.h> #include <linux/uaccess.h> #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/memblock.h> #include <linux/task_work.h> #include <linux/sched/task.h> #include <uapi/linux/mount.h> #include <linux/fs_context.h> #include <linux/shmem_fs.h> #include "pnode.h" #include "internal.h" /* Maximum number of mounts in a mount namespace */ unsigned int sysctl_mount_max __read_mostly = 100000; static unsigned int m_hash_mask __read_mostly; static unsigned int m_hash_shift __read_mostly; static unsigned int mp_hash_mask __read_mostly; static unsigned int mp_hash_shift __read_mostly; static __initdata unsigned long mhash_entries; static int __init set_mhash_entries(char *str) { if (!str) return 0; mhash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mhash_entries=", set_mhash_entries); static __initdata unsigned long mphash_entries; static int __init set_mphash_entries(char *str) { if (!str) return 0; mphash_entries = simple_strtoul(str, &str, 0); return 1; } __setup("mphash_entries=", set_mphash_entries); static u64 event; static DEFINE_IDA(mnt_id_ida); static DEFINE_IDA(mnt_group_ida); static struct hlist_head *mount_hashtable __read_mostly; static struct hlist_head *mountpoint_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; static DECLARE_RWSEM(namespace_sem); static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ /* /sys/fs */ struct kobject *fs_kobj; EXPORT_SYMBOL_GPL(fs_kobj); /* * vfsmount lock may be taken for read to prevent changes to the * vfsmount hash, ie. during mountpoint lookups or walking back * up the tree. * * It should be taken for write in all cases where the vfsmount * tree or hash is modified or when a vfsmount structure is modified. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(mount_lock); static inline struct hlist_head *m_hash(struct vfsmount *mnt, struct dentry *dentry) { unsigned long tmp = ((unsigned long)mnt / L1_CACHE_BYTES); tmp += ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> m_hash_shift); return &mount_hashtable[tmp & m_hash_mask]; } static inline struct hlist_head *mp_hash(struct dentry *dentry) { unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); tmp = tmp + (tmp >> mp_hash_shift); return &mountpoint_hashtable[tmp & mp_hash_mask]; } static int mnt_alloc_id(struct mount *mnt) { int res = ida_alloc(&mnt_id_ida, GFP_KERNEL); if (res < 0) return res; mnt->mnt_id = res; return 0; } static void mnt_free_id(struct mount *mnt) { ida_free(&mnt_id_ida, mnt->mnt_id); } /* * Allocate a new peer group ID */ static int mnt_alloc_group_id(struct mount *mnt) { int res = ida_alloc_min(&mnt_group_ida, 1, GFP_KERNEL); if (res < 0) return res; mnt->mnt_group_id = res; return 0; } /* * Release a peer group ID */ void mnt_release_group_id(struct mount *mnt) { ida_free(&mnt_group_ida, mnt->mnt_group_id); mnt->mnt_group_id = 0; } /* * vfsmount lock must be held for read */ static inline void mnt_add_count(struct mount *mnt, int n) { #ifdef CONFIG_SMP this_cpu_add(mnt->mnt_pcp->mnt_count, n); #else preempt_disable(); mnt->mnt_count += n; preempt_enable(); #endif } /* * vfsmount lock must be held for write */ int mnt_get_count(struct mount *mnt) { #ifdef CONFIG_SMP int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_count; } return count; #else return mnt->mnt_count; #endif } static struct mount *alloc_vfsmnt(const char *name) { struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { int err; err = mnt_alloc_id(mnt); if (err) goto out_free_cache; if (name) { mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL); if (!mnt->mnt_devname) goto out_free_id; } #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); if (!mnt->mnt_pcp) goto out_free_devname; this_cpu_add(mnt->mnt_pcp->mnt_count, 1); #else mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); INIT_LIST_HEAD(&mnt->mnt_mounts); INIT_LIST_HEAD(&mnt->mnt_list); INIT_LIST_HEAD(&mnt->mnt_expire); INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); INIT_HLIST_NODE(&mnt->mnt_mp_list); INIT_LIST_HEAD(&mnt->mnt_umounting); INIT_HLIST_HEAD(&mnt->mnt_stuck_children); } return mnt; #ifdef CONFIG_SMP out_free_devname: kfree_const(mnt->mnt_devname); #endif out_free_id: mnt_free_id(mnt); out_free_cache: kmem_cache_free(mnt_cache, mnt); return NULL; } /* * Most r/o checks on a fs are for operations that take * discrete amounts of time, like a write() or unlink(). * We must keep track of when those operations start * (for permission checks) and when they end, so that * we can determine when writes are able to occur to * a filesystem. */ /* * __mnt_is_readonly: check whether a mount is read-only * @mnt: the mount to check for its write status * * This shouldn't be used directly ouside of the VFS. * It does not guarantee that the filesystem will stay * r/w, just that it is right *now*. This can not and * should not be used in place of IS_RDONLY(inode). * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ bool __mnt_is_readonly(struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(__mnt_is_readonly); static inline void mnt_inc_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_inc(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers++; #endif } static inline void mnt_dec_writers(struct mount *mnt) { #ifdef CONFIG_SMP this_cpu_dec(mnt->mnt_pcp->mnt_writers); #else mnt->mnt_writers--; #endif } static unsigned int mnt_get_writers(struct mount *mnt) { #ifdef CONFIG_SMP unsigned int count = 0; int cpu; for_each_possible_cpu(cpu) { count += per_cpu_ptr(mnt->mnt_pcp, cpu)->mnt_writers; } return count; #else return mnt->mnt_writers; #endif } static int mnt_is_readonly(struct vfsmount *mnt) { if (mnt->mnt_sb->s_readonly_remount) return 1; /* Order wrt setting s_flags/s_readonly_remount in do_remount() */ smp_rmb(); return __mnt_is_readonly(mnt); } /* * Most r/o & frozen checks on a fs are for operations that take discrete * amounts of time, like a write() or unlink(). We must keep track of when * those operations start (for permission checks) and when they end, so that we * can determine when writes are able to occur to a filesystem. */ /** * __mnt_want_write - get write access to a mount without freeze protection * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mnt it read-write) before * returning success. This operation does not protect against filesystem being * frozen. When the write operation is finished, __mnt_drop_write() must be * called. This is effectively a refcount. */ int __mnt_want_write(struct vfsmount *m) { struct mount *mnt = real_mount(m); int ret = 0; preempt_disable(); mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass * MNT_WRITE_HOLD loop below, so that the slowpath can see our * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) cpu_relax(); /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until * MNT_WRITE_HOLD is cleared. */ smp_rmb(); if (mnt_is_readonly(m)) { mnt_dec_writers(mnt); ret = -EROFS; } preempt_enable(); return ret; } /** * mnt_want_write - get write access to a mount * @m: the mount on which to take a write * * This tells the low-level filesystem that a write is about to be performed to * it, and makes sure that writes are allowed (mount is read-write, filesystem * is not frozen) before returning success. When the write operation is * finished, mnt_drop_write() must be called. This is effectively a refcount. */ int mnt_want_write(struct vfsmount *m) { int ret; sb_start_write(m->mnt_sb); ret = __mnt_want_write(m); if (ret) sb_end_write(m->mnt_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write); /** * mnt_clone_write - get write access to a mount * @mnt: the mount on which to take a write * * This is effectively like mnt_want_write, except * it must only be used to take an extra write reference * on a mountpoint that we already know has a write reference * on it. This allows some optimisation. * * After finished, mnt_drop_write must be called as usual to * drop the reference. */ int mnt_clone_write(struct vfsmount *mnt) { /* superblock may be r/o */ if (__mnt_is_readonly(mnt)) return -EROFS; preempt_disable(); mnt_inc_writers(real_mount(mnt)); preempt_enable(); return 0; } EXPORT_SYMBOL_GPL(mnt_clone_write); /** * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like __mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already */ int __mnt_want_write_file(struct file *file) { if (!(file->f_mode & FMODE_WRITER)) return __mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * * This is like mnt_want_write, but it takes a file and can * do some optimisations if the file is open for write already */ int mnt_want_write_file(struct file *file) { int ret; sb_start_write(file_inode(file)->i_sb); ret = __mnt_want_write_file(file); if (ret) sb_end_write(file_inode(file)->i_sb); return ret; } EXPORT_SYMBOL_GPL(mnt_want_write_file); /** * __mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done * performing writes to it. Must be matched with * __mnt_want_write() call above. */ void __mnt_drop_write(struct vfsmount *mnt) { preempt_disable(); mnt_dec_writers(real_mount(mnt)); preempt_enable(); } /** * mnt_drop_write - give up write access to a mount * @mnt: the mount on which to give up write access * * Tells the low-level filesystem that we are done performing writes to it and * also allows filesystem to be frozen again. Must be matched with * mnt_want_write() call above. */ void mnt_drop_write(struct vfsmount *mnt) { __mnt_drop_write(mnt); sb_end_write(mnt->mnt_sb); } EXPORT_SYMBOL_GPL(mnt_drop_write); void __mnt_drop_write_file(struct file *file) { __mnt_drop_write(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) { __mnt_drop_write_file(file); sb_end_write(file_inode(file)->i_sb); } EXPORT_SYMBOL(mnt_drop_write_file); static int mnt_make_readonly(struct mount *mnt) { int ret = 0; lock_mount_hash(); mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; /* * After storing MNT_WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); /* * With writers on hold, if this value is zero, then there are * definitely no active writers (although held writers may subsequently * increment the count, they'll have to wait, and decrement it after * seeing MNT_READONLY). * * It is OK to have counter incremented on one CPU and decremented on * another: the sum will add up correctly. The danger would be when we * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. * MNT_WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on * MNT_WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) ret = -EBUSY; else mnt->mnt.mnt_flags |= MNT_READONLY; /* * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; unlock_mount_hash(); return ret; } static int __mnt_unmake_readonly(struct mount *mnt) { lock_mount_hash(); mnt->mnt.mnt_flags &= ~MNT_READONLY; unlock_mount_hash(); return 0; } int sb_prepare_remount_readonly(struct super_block *sb) { struct mount *mnt; int err = 0; /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; lock_mount_hash(); list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; smp_mb(); if (mnt_get_writers(mnt) > 0) { err = -EBUSY; break; } } } if (!err && atomic_long_read(&sb->s_remove_count)) err = -EBUSY; if (!err) { sb->s_readonly_remount = 1; smp_wmb(); } list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; } unlock_mount_hash(); return err; } static void free_vfsmnt(struct mount *mnt) { kfree_const(mnt->mnt_devname); #ifdef CONFIG_SMP free_percpu(mnt->mnt_pcp); #endif kmem_cache_free(mnt_cache, mnt); } static void delayed_free_vfsmnt(struct rcu_head *head) { free_vfsmnt(container_of(head, struct mount, mnt_rcu)); } /* call under rcu_read_lock */ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) return 1; if (bastard == NULL) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); smp_mb(); // see mntput_no_expire() if (likely(!read_seqretry(&mount_lock, seq))) return 0; if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { mnt_add_count(mnt, -1); return 1; } lock_mount_hash(); if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; } unlock_mount_hash(); /* caller will mntput() */ return -1; } /* call under rcu_read_lock */ bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) { int res = __legitimize_mnt(bastard, seq); if (likely(!res)) return true; if (unlikely(res < 0)) { rcu_read_unlock(); mntput(bastard); rcu_read_lock(); } return false; } /* * find the first mount at @dentry on vfsmount @mnt. * call under rcu_read_lock() */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { struct hlist_head *head = m_hash(mnt, dentry); struct mount *p; hlist_for_each_entry_rcu(p, head, mnt_hash) if (&p->mnt_parent->mnt == mnt && p->mnt_mountpoint == dentry) return p; return NULL; } /* * lookup_mnt - Return the first child mount mounted at path * * "First" means first mounted chronologically. If you create the * following mounts: * * mount /dev/sda1 /mnt * mount /dev/sda2 /mnt * mount /dev/sda3 /mnt * * Then lookup_mnt() on the base /mnt dentry in the root mount will * return successively the root dentry and vfsmount of /dev/sda1, then * /dev/sda2, then /dev/sda3, then NULL. * * lookup_mnt takes a reference to the found vfsmount. */ struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; unsigned seq; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); child_mnt = __lookup_mnt(path->mnt, path->dentry); m = child_mnt ? &child_mnt->mnt : NULL; } while (!legitimize_mnt(m, seq)); rcu_read_unlock(); return m; } static inline void lock_ns_list(struct mnt_namespace *ns) { spin_lock(&ns->ns_lock); } static inline void unlock_ns_list(struct mnt_namespace *ns) { spin_unlock(&ns->ns_lock); } static inline bool mnt_is_cursor(struct mount *mnt) { return mnt->mnt.mnt_flags & MNT_CURSOR; } /* * __is_local_mountpoint - Test to see if dentry is a mountpoint in the * current mount namespace. * * The common case is dentries are not mountpoints at all and that * test is handled inline. For the slow case when we are actually * dealing with a mountpoint of some kind, walk through all of the * mounts in the current mount namespace and test to see if the dentry * is a mountpoint. * * The mount_hashtable is not usable in the context because we * need to identify all mounts that may be in the current mount * namespace not just a mount that happens to have some specified * parent mount. */ bool __is_local_mountpoint(struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt; bool is_covered = false; down_read(&namespace_sem); lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { if (mnt_is_cursor(mnt)) continue; is_covered = (mnt->mnt_mountpoint == dentry); if (is_covered) break; } unlock_ns_list(ns); up_read(&namespace_sem); return is_covered; } static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { mp->m_count++; return mp; } } return NULL; } static struct mountpoint *get_mountpoint(struct dentry *dentry) { struct mountpoint *mp, *new = NULL; int ret; if (d_mountpoint(dentry)) { /* might be worth a WARN_ON() */ if (d_unlinked(dentry)) return ERR_PTR(-ENOENT); mountpoint: read_seqlock_excl(&mount_lock); mp = lookup_mountpoint(dentry); read_sequnlock_excl(&mount_lock); if (mp) goto done; } if (!new) new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!new) return ERR_PTR(-ENOMEM); /* Exactly one processes may set d_mounted */ ret = d_set_mounted(dentry); /* Someone else set d_mounted? */ if (ret == -EBUSY) goto mountpoint; /* The dentry is not available as a mountpoint? */ mp = ERR_PTR(ret); if (ret) goto done; /* Add the new mountpoint to the hash table */ read_seqlock_excl(&mount_lock); new->m_dentry = dget(dentry); new->m_count = 1; hlist_add_head(&new->m_hash, mp_hash(dentry)); INIT_HLIST_HEAD(&new->m_list); read_sequnlock_excl(&mount_lock); mp = new; new = NULL; done: kfree(new); return mp; } /* * vfsmount lock must be held. Additionally, the caller is responsible * for serializing calls for given disposal list. */ static void __put_mountpoint(struct mountpoint *mp, struct list_head *list) { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); dput_to_list(dentry, list); hlist_del(&mp->m_hash); kfree(mp); } } /* called with namespace_lock and vfsmount lock */ static void put_mountpoint(struct mountpoint *mp) { __put_mountpoint(mp, &ex_mountpoints); } static inline int check_mnt(struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } /* * vfsmount lock must be held for write */ static void touch_mnt_namespace(struct mnt_namespace *ns) { if (ns) { ns->event = ++event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static void __touch_mnt_namespace(struct mnt_namespace *ns) { if (ns && ns->event != event) { ns->event = event; wake_up_interruptible(&ns->poll); } } /* * vfsmount lock must be held for write */ static struct mountpoint *unhash_mnt(struct mount *mnt) { struct mountpoint *mp; mnt->mnt_parent = mnt; mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); hlist_del_init(&mnt->mnt_mp_list); mp = mnt->mnt_mp; mnt->mnt_mp = NULL; return mp; } /* * vfsmount lock must be held for write */ static void umount_mnt(struct mount *mnt) { put_mountpoint(unhash_mnt(mnt)); } /* * vfsmount lock must be held for write */ void mnt_set_mountpoint(struct mount *mnt, struct mountpoint *mp, struct mount *child_mnt) { mp->m_count++; mnt_add_count(mnt, 1); /* essentially, that's mntget */ child_mnt->mnt_mountpoint = mp->m_dentry; child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } static void __attach_mnt(struct mount *mnt, struct mount *parent) { hlist_add_head_rcu(&mnt->mnt_hash, m_hash(&parent->mnt, mnt->mnt_mountpoint)); list_add_tail(&mnt->mnt_child, &parent->mnt_mounts); } /* * vfsmount lock must be held for write */ static void attach_mnt(struct mount *mnt, struct mount *parent, struct mountpoint *mp) { mnt_set_mountpoint(parent, mp, mnt); __attach_mnt(mnt, parent); } void mnt_change_mountpoint(struct mount *parent, struct mountpoint *mp, struct mount *mnt) { struct mountpoint *old_mp = mnt->mnt_mp; struct mount *old_parent = mnt->mnt_parent; list_del_init(&mnt->mnt_child); hlist_del_init(&mnt->mnt_mp_list); hlist_del_init_rcu(&mnt->mnt_hash); attach_mnt(mnt, parent, mp); put_mountpoint(old_mp); mnt_add_count(old_parent, -1); } /* * vfsmount lock must be held for write */ static void commit_tree(struct mount *mnt) { struct mount *parent = mnt->mnt_parent; struct mount *m; LIST_HEAD(head); struct mnt_namespace *n = parent->mnt_ns; BUG_ON(parent == mnt); list_add_tail(&head, &mnt->mnt_list); list_for_each_entry(m, &head, mnt_list) m->mnt_ns = n; list_splice(&head, n->list.prev); n->mounts += n->pending_mounts; n->pending_mounts = 0; __attach_mnt(mnt, parent); touch_mnt_namespace(n); } static struct mount *next_mnt(struct mount *p, struct mount *root) { struct list_head *next = p->mnt_mounts.next; if (next == &p->mnt_mounts) { while (1) { if (p == root) return NULL; next = p->mnt_child.next; if (next != &p->mnt_parent->mnt_mounts) break; p = p->mnt_parent; } } return list_entry(next, struct mount, mnt_child); } static struct mount *skip_mnt_tree(struct mount *p) { struct list_head *prev = p->mnt_mounts.prev; while (prev != &p->mnt_mounts) { p = list_entry(prev, struct mount, mnt_child); prev = p->mnt_mounts.prev; } return p; } /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached * * Create a mount to an already configured superblock. If necessary, the * caller should invoke vfs_get_tree() before calling this. * * Note that this does not attach the mount to anything. */ struct vfsmount *vfs_create_mount(struct fs_context *fc) { struct mount *mnt; if (!fc->root) return ERR_PTR(-EINVAL); mnt = alloc_vfsmnt(fc->source ?: "none"); if (!mnt) return ERR_PTR(-ENOMEM); if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; atomic_inc(&fc->root->d_sb->s_active); mnt->mnt.mnt_sb = fc->root->d_sb; mnt->mnt.mnt_root = dget(fc->root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); struct vfsmount *fc_mount(struct fs_context *fc) { int err = vfs_get_tree(fc); if (!err) { up_write(&fc->root->d_sb->s_umount); return vfs_create_mount(fc); } return ERR_PTR(err); } EXPORT_SYMBOL(fc_mount); struct vfsmount *vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void *data) { struct fs_context *fc; struct vfsmount *mnt; int ret = 0; if (!type) return ERR_PTR(-EINVAL); fc = fs_context_for_mount(type, flags); if (IS_ERR(fc)) return ERR_CAST(fc); if (name) ret = vfs_parse_fs_string(fc, "source", name, strlen(name)); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) mnt = fc_mount(fc); else mnt = ERR_PTR(ret); put_fs_context(fc); return mnt; } EXPORT_SYMBOL_GPL(vfs_kern_mount); struct vfsmount * vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, const char *name, void *data) { /* Until it is worked out how to pass the user namespace * through from the parent mount to the submount don't support * unprivileged mounts with submounts. */ if (mountpoint->d_sb->s_user_ns != &init_user_ns) return ERR_PTR(-EPERM); return vfs_kern_mount(type, SB_SUBMOUNT, name, data); } EXPORT_SYMBOL_GPL(vfs_submount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { struct super_block *sb = old->mnt.mnt_sb; struct mount *mnt; int err; mnt = alloc_vfsmnt(old->mnt_devname); if (!mnt) return ERR_PTR(-ENOMEM); if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE)) mnt->mnt_group_id = 0; /* not a peer of original */ else mnt->mnt_group_id = old->mnt_group_id; if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { err = mnt_alloc_group_id(mnt); if (err) goto out_free; } mnt->mnt.mnt_flags = old->mnt.mnt_flags; mnt->mnt.mnt_flags &= ~(MNT_WRITE_HOLD|MNT_MARKED|MNT_INTERNAL); atomic_inc(&sb->s_active); mnt->mnt.mnt_sb = sb; mnt->mnt.mnt_root = dget(root); mnt->mnt_mountpoint = mnt->mnt.mnt_root; mnt->mnt_parent = mnt; lock_mount_hash(); list_add_tail(&mnt->mnt_instance, &sb->s_mounts); unlock_mount_hash(); if ((flag & CL_SLAVE) || ((flag & CL_SHARED_TO_SLAVE) && IS_MNT_SHARED(old))) { list_add(&mnt->mnt_slave, &old->mnt_slave_list); mnt->mnt_master = old; CLEAR_MNT_SHARED(mnt); } else if (!(flag & CL_PRIVATE)) { if ((flag & CL_MAKE_SHARED) || IS_MNT_SHARED(old)) list_add(&mnt->mnt_share, &old->mnt_share); if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; } else { CLEAR_MNT_SHARED(mnt); } if (flag & CL_MAKE_SHARED) set_mnt_shared(mnt); /* stick the duplicate mount on the same expiry list * as the original if that was on one */ if (flag & CL_EXPIRE) { if (!list_empty(&old->mnt_expire)) list_add(&mnt->mnt_expire, &old->mnt_expire); } return mnt; out_free: mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); } static void cleanup_mnt(struct mount *mnt) { struct hlist_node *p; struct mount *m; /* * The warning here probably indicates that somebody messed * up a mnt_want/drop_write() pair. If this happens, the * filesystem was probably unable to make r/w->r/o transitions. * The locking used to deal with mnt_count decrement provides barriers, * so mnt_get_writers() below is safe. */ WARN_ON(mnt_get_writers(mnt)); if (unlikely(mnt->mnt_pins.first)) mnt_pin_kill(mnt); hlist_for_each_entry_safe(m, p, &mnt->mnt_stuck_children, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } fsnotify_vfsmount_delete(&mnt->mnt); dput(mnt->mnt.mnt_root); deactivate_super(mnt->mnt.mnt_sb); mnt_free_id(mnt); call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); } static void __cleanup_mnt(struct rcu_head *head) { cleanup_mnt(container_of(head, struct mount, mnt_rcu)); } static LLIST_HEAD(delayed_mntput_list); static void delayed_mntput(struct work_struct *unused) { struct llist_node *node = llist_del_all(&delayed_mntput_list); struct mount *m, *t; llist_for_each_entry_safe(m, t, node, mnt_llist) cleanup_mnt(m); } static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); static void mntput_no_expire(struct mount *mnt) { LIST_HEAD(list); int count; rcu_read_lock(); if (likely(READ_ONCE(mnt->mnt_ns))) { /* * Since we don't do lock_mount_hash() here, * ->mnt_ns can change under us. However, if it's * non-NULL, then there's a reference that won't * be dropped until after an RCU delay done after * turning ->mnt_ns NULL. So if we observe it * non-NULL under rcu_read_lock(), the reference * we are dropping is not the final one. */ mnt_add_count(mnt, -1); rcu_read_unlock(); return; } lock_mount_hash(); /* * make sure that if __legitimize_mnt() has not seen us grab * mount_lock, we'll see their refcount increment here. */ smp_mb(); mnt_add_count(mnt, -1); count = mnt_get_count(mnt); if (count != 0) { WARN_ON(count < 0); rcu_read_unlock(); unlock_mount_hash(); return; } if (unlikely(mnt->mnt.mnt_flags & MNT_DOOMED)) { rcu_read_unlock(); unlock_mount_hash(); return; } mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); list_del(&mnt->mnt_instance); if (unlikely(!list_empty(&mnt->mnt_mounts))) { struct mount *p, *tmp; list_for_each_entry_safe(p, tmp, &mnt->mnt_mounts, mnt_child) { __put_mountpoint(unhash_mnt(p), &list); hlist_add_head(&p->mnt_umount, &mnt->mnt_stuck_children); } } unlock_mount_hash(); shrink_dentry_list(&list); if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { struct task_struct *task = current; if (likely(!(task->flags & PF_KTHREAD))) { init_task_work(&mnt->mnt_rcu, __cleanup_mnt); if (!task_work_add(task, &mnt->mnt_rcu, TWA_RESUME)) return; } if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) schedule_delayed_work(&delayed_mntput_work, 1); return; } cleanup_mnt(mnt); } void mntput(struct vfsmount *mnt) { if (mnt) { struct mount *m = real_mount(mnt); /* avoid cacheline pingpong, hope gcc doesn't get "smart" */ if (unlikely(m->mnt_expiry_mark)) m->mnt_expiry_mark = 0; mntput_no_expire(m); } } EXPORT_SYMBOL(mntput); struct vfsmount *mntget(struct vfsmount *mnt) { if (mnt) mnt_add_count(real_mount(mnt), 1); return mnt; } EXPORT_SYMBOL(mntget); /* path_is_mountpoint() - Check if path is a mount in the current * namespace. * * d_mountpoint() can only be used reliably to establish if a dentry is * not mounted in any namespace and that common case is handled inline. * d_mountpoint() isn't aware of the possibility there may be multiple * mounts using a given dentry in a different namespace. This function * checks if the passed in path is a mountpoint rather than the dentry * alone. */ bool path_is_mountpoint(const struct path *path) { unsigned seq; bool res; if (!d_mountpoint(path->dentry)) return false; rcu_read_lock(); do { seq = read_seqbegin(&mount_lock); res = __path_is_mountpoint(path); } while (read_seqretry(&mount_lock, seq)); rcu_read_unlock(); return res; } EXPORT_SYMBOL(path_is_mountpoint); struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); if (IS_ERR(p)) return ERR_CAST(p); p->mnt.mnt_flags |= MNT_INTERNAL; return &p->mnt; } #ifdef CONFIG_PROC_FS static struct mount *mnt_list_next(struct mnt_namespace *ns, struct list_head *p) { struct mount *mnt, *ret = NULL; lock_ns_list(ns); list_for_each_continue(p, &ns->list) { mnt = list_entry(p, typeof(*mnt), mnt_list); if (!mnt_is_cursor(mnt)) { ret = mnt; break; } } unlock_ns_list(ns); return ret; } /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_mounts *p = m->private; struct list_head *prev; down_read(&namespace_sem); if (!*pos) { prev = &p->ns->list; } else { prev = &p->cursor.mnt_list; /* Read after we'd reached the end? */ if (list_empty(prev)) return NULL; } return mnt_list_next(p->ns, prev); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_mounts *p = m->private; struct mount *mnt = v; ++*pos; return mnt_list_next(p->ns, &mnt->mnt_list); } static void m_stop(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *mnt = v; lock_ns_list(p->ns); if (mnt) list_move_tail(&p->cursor.mnt_list, &mnt->mnt_list); else list_del_init(&p->cursor.mnt_list); unlock_ns_list(p->ns); up_read(&namespace_sem); } static int m_show(struct seq_file *m, void *v) { struct proc_mounts *p = m->private; struct mount *r = v; return p->show(m, &r->mnt); } const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = m_show, }; void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor) { down_read(&namespace_sem); lock_ns_list(ns); list_del(&cursor->mnt_list); unlock_ns_list(ns); up_read(&namespace_sem); } #endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy * @mnt: root of mount tree * * This is called to check if a tree of mounts has any * open files, pwds, chroots or sub mounts that are * busy. */ int may_umount_tree(struct vfsmount *m) { struct mount *mnt = real_mount(m); int actual_refs = 0; int minimum_refs = 0; struct mount *p; BUG_ON(!m); /* write lock needed for mnt_get_count */ lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { actual_refs += mnt_get_count(p); minimum_refs += 2; } unlock_mount_hash(); if (actual_refs > minimum_refs) return 0; return 1; } EXPORT_SYMBOL(may_umount_tree); /** * may_umount - check if a mount point is busy * @mnt: root of mount * * This is called to check if a mount point has any * open files, pwds, chroots or sub mounts. If the * mount has sub mounts this will return busy * regardless of whether the sub mounts are busy. * * Doesn't take quota and stuff into account. IOW, in some cases it will * give false negatives. The main reason why it's here is that we need * a non-destructive way to look for easily umountable filesystems. */ int may_umount(struct vfsmount *mnt) { int ret = 1; down_read(&namespace_sem); lock_mount_hash(); if (propagate_mount_busy(real_mount(mnt), 2)) ret = 0; unlock_mount_hash(); up_read(&namespace_sem); return ret; } EXPORT_SYMBOL(may_umount); static void namespace_unlock(void) { struct hlist_head head; struct hlist_node *p; struct mount *m; LIST_HEAD(list); hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); up_write(&namespace_sem); shrink_dentry_list(&list); if (likely(hlist_empty(&head))) return; synchronize_rcu_expedited(); hlist_for_each_entry_safe(m, p, &head, mnt_umount) { hlist_del(&m->mnt_umount); mntput(&m->mnt); } } static inline void namespace_lock(void) { down_write(&namespace_sem); } enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, UMOUNT_CONNECTED = 4, }; static bool disconnect_mount(struct mount *mnt, enum umount_tree_flags how) { /* Leaving mounts connected is only valid for lazy umounts */ if (how & UMOUNT_SYNC) return true; /* A mount without a parent has nothing to be connected to */ if (!mnt_has_parent(mnt)) return true; /* Because the reference counting rules change when mounts are * unmounted and connected, umounted mounts may not be * connected to mounted mounts. */ if (!(mnt->mnt_parent->mnt.mnt_flags & MNT_UMOUNT)) return true; /* Has it been requested that the mount remain connected? */ if (how & UMOUNT_CONNECTED) return false; /* Is the mount locked such that it needs to remain connected? */ if (IS_MNT_LOCKED(mnt)) return false; /* By default disconnect the mount */ return true; } /* * mount_lock must be held * namespace_sem must be held for write */ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) { LIST_HEAD(tmp_list); struct mount *p; if (how & UMOUNT_PROPAGATE) propagate_mount_unlock(mnt); /* Gather the mounts to umount */ for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt.mnt_flags |= MNT_UMOUNT; list_move(&p->mnt_list, &tmp_list); } /* Hide the mounts from mnt_mounts */ list_for_each_entry(p, &tmp_list, mnt_list) { list_del_init(&p->mnt_child); } /* Add propogated mounts to the tmp_list */ if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; p = list_first_entry(&tmp_list, struct mount, mnt_list); list_del_init(&p->mnt_expire); list_del_init(&p->mnt_list); ns = p->mnt_ns; if (ns) { ns->mounts--; __touch_mnt_namespace(ns); } p->mnt_ns = NULL; if (how & UMOUNT_SYNC) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; disconnect = disconnect_mount(p, how); if (mnt_has_parent(p)) { mnt_add_count(p->mnt_parent, -1); if (!disconnect) { /* Don't forget about p */ list_add_tail(&p->mnt_child, &p->mnt_parent->mnt_mounts); } else { umount_mnt(p); } } change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); } } static void shrink_submounts(struct mount *mnt); static int do_umount_root(struct super_block *sb) { int ret = 0; down_write(&sb->s_umount); if (!sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, SB_RDONLY, SB_RDONLY); if (IS_ERR(fc)) { ret = PTR_ERR(fc); } else { ret = parse_monolithic_mount_data(fc, NULL); if (!ret) ret = reconfigure_super(fc); put_fs_context(fc); } } up_write(&sb->s_umount); return ret; } static int do_umount(struct mount *mnt, int flags) { struct super_block *sb = mnt->mnt.mnt_sb; int retval; retval = security_sb_umount(&mnt->mnt, flags); if (retval) return retval; /* * Allow userspace to request a mountpoint be expired rather than * unmounting unconditionally. Unmount only happens if: * (1) the mark is already set (the mark is cleared by mntput()) * (2) the usage count == 1 [parent vfsmount] + 1 [sys_umount] */ if (flags & MNT_EXPIRE) { if (&mnt->mnt == current->fs->root.mnt || flags & (MNT_FORCE | MNT_DETACH)) return -EINVAL; /* * probably don't strictly need the lock here if we examined * all race cases, but it's a slowpath. */ lock_mount_hash(); if (mnt_get_count(mnt) != 2) { unlock_mount_hash(); return -EBUSY; } unlock_mount_hash(); if (!xchg(&mnt->mnt_expiry_mark, 1)) return -EAGAIN; } /* * If we may have to abort operations to get out of this * mount, and they will themselves hold resources we must * allow the fs to do things. In the Unix tradition of * 'Gee thats tricky lets do it in userspace' the umount_begin * might fail to complete on the first run through as other tasks * must return, and the like. Thats for the mount program to worry * about for the moment. */ if (flags & MNT_FORCE && sb->s_op->umount_begin) { sb->s_op->umount_begin(sb); } /* * No sense to grab the lock for this test, but test itself looks * somewhat bogus. Suggestions for better replacement? * Ho-hum... In principle, we might treat that as umount + switch * to rootfs. GC would eventually take care of the old vfsmount. * Actually it makes sense, especially if rootfs would contain a * /reboot - static binary that would close all descriptors and * call reboot(9). Then init(8) could umount root and exec /reboot. */ if (&mnt->mnt == current->fs->root.mnt && !(flags & MNT_DETACH)) { /* * Special case for "unmounting" root ... * we just try to remount it readonly. */ if (!ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return do_umount_root(sb); } namespace_lock(); lock_mount_hash(); /* Recheck MNT_LOCKED with the locks held */ retval = -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) goto out; event++; if (flags & MNT_DETACH) { if (!list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { if (!list_empty(&mnt->mnt_list)) umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); retval = 0; } } out: unlock_mount_hash(); namespace_unlock(); return retval; } /* * __detach_mounts - lazily unmount all mounts on the specified dentry * * During unlink, rmdir, and d_drop it is possible to loose the path * to an existing mountpoint, and wind up leaking the mount. * detach_mounts allows lazily unmounting those mounts instead of * leaking them. * * The caller may hold dentry->d_inode->i_mutex. */ void __detach_mounts(struct dentry *dentry) { struct mountpoint *mp; struct mount *mnt; namespace_lock(); lock_mount_hash(); mp = lookup_mountpoint(dentry); if (!mp) goto out_unlock; event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); if (mnt->mnt.mnt_flags & MNT_UMOUNT) { umount_mnt(mnt); hlist_add_head(&mnt->mnt_umount, &unmounted); } else umount_tree(mnt, UMOUNT_CONNECTED); } put_mountpoint(mp); out_unlock: unlock_mount_hash(); namespace_unlock(); } /* * Is the caller allowed to modify his namespace? */ static inline bool may_mount(void) { return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } #ifdef CONFIG_MANDATORY_FILE_LOCKING static bool may_mandlock(void) { pr_warn_once("======================================================\n" "WARNING: the mand mount option is being deprecated and\n" " will be removed in v5.15!\n" "======================================================\n"); return capable(CAP_SYS_ADMIN); } #else static inline bool may_mandlock(void) { pr_warn("VFS: \"mand\" mount option not supported"); return false; } #endif static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); if (!may_mount()) return -EPERM; if (path->dentry != path->mnt->mnt_root) return -EINVAL; if (!check_mnt(mnt)) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) return -EPERM; return 0; } // caller is responsible for flags being sane int path_umount(struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; ret = can_umount(path, flags); if (!ret) ret = do_umount(mnt, flags); /* we mustn't call path_put() as that would clear mnt_expiry_mark */ dput(path->dentry); mntput_no_expire(mnt); return ret; } static int ksys_umount(char __user *name, int flags) { int lookup_flags = LOOKUP_MOUNTPOINT; struct path path; int ret; // basic validity checks done first if (flags & ~(MNT_FORCE | MNT_DETACH | MNT_EXPIRE | UMOUNT_NOFOLLOW)) return -EINVAL; if (!(flags & UMOUNT_NOFOLLOW)) lookup_flags |= LOOKUP_FOLLOW; ret = user_path_at(AT_FDCWD, name, lookup_flags, &path); if (ret) return ret; return path_umount(&path, flags); } SYSCALL_DEFINE2(umount, char __user *, name, int, flags) { return ksys_umount(name, flags); } #ifdef __ARCH_WANT_SYS_OLDUMOUNT /* * The 2.0 compatible umount. No flags. */ SYSCALL_DEFINE1(oldumount, char __user *, name) { return ksys_umount(name, 0); } #endif static bool is_mnt_ns_file(struct dentry *dentry) { /* Is this a proxy for a mount namespace? */ return dentry->d_op == &ns_dentry_operations && dentry->d_fsdata == &mntns_operations; } static struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } struct ns_common *from_mnt_ns(struct mnt_namespace *mnt) { return &mnt->ns; } static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? */ struct mnt_namespace *mnt_ns; if (!is_mnt_ns_file(dentry)) return false; mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } struct mount *copy_tree(struct mount *mnt, struct dentry *dentry, int flag) { struct mount *res, *p, *q, *r, *parent; if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(mnt)) return ERR_PTR(-EINVAL); if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(dentry)) return ERR_PTR(-EINVAL); res = q = clone_mnt(mnt, dentry, flag); if (IS_ERR(q)) return q; q->mnt_mountpoint = mnt->mnt_mountpoint; p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { struct mount *s; if (!is_subdir(r->mnt_mountpoint, dentry)) continue; for (s = r; s; s = next_mnt(s, r)) { if (!(flag & CL_COPY_UNBINDABLE) && IS_MNT_UNBINDABLE(s)) { if (s->mnt.mnt_flags & MNT_LOCKED) { /* Both unbindable and locked. */ q = ERR_PTR(-EPERM); goto out; } else { s = skip_mnt_tree(s); continue; } } if (!(flag & CL_COPY_MNT_NS_FILE) && is_mnt_ns_file(s->mnt.mnt_root)) { s = skip_mnt_tree(s); continue; } while (p != s->mnt_parent) { p = p->mnt_parent; q = q->mnt_parent; } p = s; parent = q; q = clone_mnt(p, p->mnt.mnt_root, flag); if (IS_ERR(q)) goto out; lock_mount_hash(); list_add_tail(&q->mnt_list, &res->mnt_list); attach_mnt(q, parent, p->mnt_mp); unlock_mount_hash(); } } return res; out: if (res) { lock_mount_hash(); umount_tree(res, UMOUNT_SYNC); unlock_mount_hash(); } return q; } /* Caller should check returned pointer for errors */ struct vfsmount *collect_mounts(const struct path *path) { struct mount *tree; namespace_lock(); if (!check_mnt(real_mount(path->mnt))) tree = ERR_PTR(-EINVAL); else tree = copy_tree(real_mount(path->mnt), path->dentry, CL_COPY_ALL | CL_PRIVATE); namespace_unlock(); if (IS_ERR(tree)) return ERR_CAST(tree); return &tree->mnt; } static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); void dissolve_on_fput(struct vfsmount *mnt) { struct mnt_namespace *ns; namespace_lock(); lock_mount_hash(); ns = real_mount(mnt)->mnt_ns; if (ns) { if (is_anon_ns(ns)) umount_tree(real_mount(mnt), UMOUNT_CONNECTED); else ns = NULL; } unlock_mount_hash(); namespace_unlock(); if (ns) free_mnt_ns(ns); } void drop_collected_mounts(struct vfsmount *mnt) { namespace_lock(); lock_mount_hash(); umount_tree(real_mount(mnt), 0); unlock_mount_hash(); namespace_unlock(); } static bool has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { if (!is_subdir(child->mnt_mountpoint, dentry)) continue; if (child->mnt.mnt_flags & MNT_LOCKED) return true; } return false; } /** * clone_private_mount - create a private clone of a path * * This creates a new vfsmount, which will be the clone of @path. The new will * not be attached anywhere in the namespace and will be private (i.e. changes * to the originating mount won't be propagated into this). * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; down_read(&namespace_sem); if (IS_MNT_UNBINDABLE(old_mnt)) goto invalid; if (!check_mnt(old_mnt)) goto invalid; if (has_locked_children(old_mnt, path->dentry)) goto invalid; new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); up_read(&namespace_sem); if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; return &new_mnt->mnt; invalid: up_read(&namespace_sem); return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { struct mount *mnt; int res = f(root, arg); if (res) return res; list_for_each_entry(mnt, &real_mount(root)->mnt_list, mnt_list) { res = f(&mnt->mnt, arg); if (res) return res; } return 0; } static void lock_mnt_tree(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { int flags = p->mnt.mnt_flags; /* Don't allow unprivileged users to change mount flags */ flags |= MNT_LOCK_ATIME; if (flags & MNT_READONLY) flags |= MNT_LOCK_READONLY; if (flags & MNT_NODEV) flags |= MNT_LOCK_NODEV; if (flags & MNT_NOSUID) flags |= MNT_LOCK_NOSUID; if (flags & MNT_NOEXEC) flags |= MNT_LOCK_NOEXEC; /* Don't allow unprivileged users to reveal what is under a mount */ if (list_empty(&p->mnt_expire)) flags |= MNT_LOCKED; p->mnt.mnt_flags = flags; } } static void cleanup_group_ids(struct mount *mnt, struct mount *end) { struct mount *p; for (p = mnt; p != end; p = next_mnt(p, mnt)) { if (p->mnt_group_id && !IS_MNT_SHARED(p)) mnt_release_group_id(p); } } static int invent_group_ids(struct mount *mnt, bool recurse) { struct mount *p; for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { int err = mnt_alloc_group_id(p); if (err) { cleanup_group_ids(mnt, p); return err; } } } return 0; } int count_mounts(struct mnt_namespace *ns, struct mount *mnt) { unsigned int max = READ_ONCE(sysctl_mount_max); unsigned int mounts = 0, old, pending, sum; struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) mounts++; old = ns->mounts; pending = ns->pending_mounts; sum = old + pending; if ((old > sum) || (pending > sum) || (max < sum) || (mounts > (max - sum))) return -ENOSPC; ns->pending_mounts = pending + mounts; return 0; } /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached * @parent_nd : if non-null, detach the source_mnt from its parent and * store the parent mount and mountpoint dentry. * (done when source_mnt is moved) * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. * --------------------------------------------------------------------------- * | BIND MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (++) | shared (+) | shared(+++)| invalid | * | | | | | | * |non-shared| shared (+) | private | slave (*) | invalid | * *************************************************************************** * A bind operation clones the source mount and mounts the clone on the * destination mount. * * (++) the cloned mount is propagated to all the mounts in the propagation * tree of the destination mount and the cloned mount is added to * the peer group of the source mount. * (+) the cloned mount is created under the destination mount and is marked * as shared. The cloned mount is added to the peer group of the source * mount. * (+++) the mount is propagated to all the mounts in the propagation tree * of the destination mount and the cloned mount is made slave * of the same master as that of the source mount. The cloned mount * is marked as 'shared and slave'. * (*) the cloned mount is made a slave of the same master as that of the * source mount. * * --------------------------------------------------------------------------- * | MOVE MOUNT OPERATION | * |************************************************************************** * | source-->| shared | private | slave | unbindable | * | dest | | | | | * | | | | | | | * | v | | | | | * |************************************************************************** * | shared | shared (+) | shared (+) | shared(+++) | invalid | * | | | | | | * |non-shared| shared (+*) | private | slave (*) | unbindable | * *************************************************************************** * * (+) the mount is moved to the destination. And is then propagated to * all the mounts in the propagation tree of the destination mount. * (+*) the mount is moved to the destination. * (+++) the mount is moved to the destination and is then propagated to * all the mounts belonging to the destination mount's propagation tree. * the mount is marked as 'shared and slave'. * (*) the mount continues to be a slave at the new location. * * if the source mount is a tree, the operations explained above is * applied to each mount in the tree. * Must be called without spinlocks held, since this function can sleep * in allocations. */ static int attach_recursive_mnt(struct mount *source_mnt, struct mount *dest_mnt, struct mountpoint *dest_mp, bool moving) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct mountpoint *smp; struct mount *child, *p; struct hlist_node *n; int err; /* Preallocate a mountpoint in case the new mounts need * to be tucked under other mounts. */ smp = get_mountpoint(source_mnt->mnt.mnt_root); if (IS_ERR(smp)) return PTR_ERR(smp); /* Is there space to add these mounts to the mount namespace? */ if (!moving) { err = count_mounts(ns, source_mnt); if (err) goto out; } if (IS_MNT_SHARED(dest_mnt)) { err = invent_group_ids(source_mnt, true); if (err) goto out; err = propagate_mnt(dest_mnt, dest_mp, source_mnt, &tree_list); lock_mount_hash(); if (err) goto out_cleanup_ids; for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } else { lock_mount_hash(); } if (moving) { unhash_mnt(source_mnt); attach_mnt(source_mnt, dest_mnt, dest_mp); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { /* move from anon - the caller will destroy */ list_del_init(&source_mnt->mnt_ns->list); } mnt_set_mountpoint(dest_mnt, dest_mp, source_mnt); commit_tree(source_mnt); } hlist_for_each_entry_safe(child, n, &tree_list, mnt_hash) { struct mount *q; hlist_del_init(&child->mnt_hash); q = __lookup_mnt(&child->mnt_parent->mnt, child->mnt_mountpoint); if (q) mnt_change_mountpoint(child, smp, q); /* Notice when we are propagating across user namespaces */ if (child->mnt_parent->mnt_ns->user_ns != user_ns) lock_mnt_tree(child); child->mnt.mnt_flags &= ~MNT_LOCKED; commit_tree(child); } put_mountpoint(smp); unlock_mount_hash(); return 0; out_cleanup_ids: while (!hlist_empty(&tree_list)) { child = hlist_entry(tree_list.first, struct mount, mnt_hash); child->mnt_parent->mnt_ns->pending_mounts = 0; umount_tree(child, UMOUNT_SYNC); } unlock_mount_hash(); cleanup_group_ids(source_mnt, NULL); out: ns->pending_mounts = 0; read_seqlock_excl(&mount_lock); put_mountpoint(smp); read_sequnlock_excl(&mount_lock); return err; } static struct mountpoint *lock_mount(struct path *path) { struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); goto retry; } static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; read_seqlock_excl(&mount_lock); put_mountpoint(where); read_sequnlock_excl(&mount_lock); namespace_unlock(); inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; if (d_is_dir(mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; return attach_recursive_mnt(mnt, p, mp, false); } /* * Sanity check the flags to change_mnt_propagation. */ static int flags_to_propagation_type(int ms_flags) { int type = ms_flags & ~(MS_REC | MS_SILENT); /* Fail if any non-propagation flags are set */ if (type & ~(MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return 0; /* Only one propagation flag should be set */ if (!is_power_of_2(type)) return 0; return type; } /* * recursively change the type of the mountpoint. */ static int do_change_type(struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; int err = 0; if (path->dentry != path->mnt->mnt_root) return -EINVAL; type = flags_to_propagation_type(ms_flags); if (!type) return -EINVAL; namespace_lock(); if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) goto out_unlock; } lock_mount_hash(); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); unlock_mount_hash(); out_unlock: namespace_unlock(); return err; } static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); if (IS_MNT_UNBINDABLE(old)) return mnt; if (!check_mnt(old) && old_path->dentry->d_op != &ns_dentry_operations) return mnt; if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; if (recurse) mnt = copy_tree(old, old_path->dentry, CL_COPY_MNT_NS_FILE); else mnt = clone_mnt(old, old_path->dentry, 0); if (!IS_ERR(mnt)) mnt->mnt.mnt_flags &= ~MNT_LOCKED; return mnt; } /* * do loopback mount. */ static int do_loopback(struct path *path, const char *old_name, int recurse) { struct path old_path; struct mount *mnt = NULL, *parent; struct mountpoint *mp; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &old_path); if (err) return err; err = -EINVAL; if (mnt_ns_loop(old_path.dentry)) goto out; mp = lock_mount(path); if (IS_ERR(mp)) { err = PTR_ERR(mp); goto out; } parent = real_mount(path->mnt); if (!check_mnt(parent)) goto out2; mnt = __do_loopback(&old_path, recurse); if (IS_ERR(mnt)) { err = PTR_ERR(mnt); goto out2; } err = graft_tree(mnt, parent, mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } out2: unlock_mount(mp); out: path_put(&old_path); return err; } static struct file *open_detached_copy(struct path *path, bool recursive) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); struct mount *mnt, *p; struct file *file; if (IS_ERR(ns)) return ERR_CAST(ns); namespace_lock(); mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { namespace_unlock(); free_mnt_ns(ns); return ERR_CAST(mnt); } lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { p->mnt_ns = ns; ns->mounts++; } ns->root = mnt; list_add_tail(&ns->list, &mnt->mnt_list); mntget(&mnt->mnt); unlock_mount_hash(); namespace_unlock(); mntput(path->mnt); path->mnt = &mnt->mnt; file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); else file->f_mode |= FMODE_NEED_UNMOUNT; return file; } SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) { struct file *file; struct path path; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; int error; int fd; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) return -EINVAL; if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) return -EINVAL; if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; if (flags & AT_SYMLINK_NOFOLLOW) lookup_flags &= ~LOOKUP_FOLLOW; if (flags & AT_EMPTY_PATH) lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) return -EPERM; fd = get_unused_fd_flags(flags & O_CLOEXEC); if (fd < 0) return fd; error = user_path_at(dfd, filename, lookup_flags, &path); if (unlikely(error)) { file = ERR_PTR(error); } else { if (detached) file = open_detached_copy(&path, flags & AT_RECURSIVE); else file = dentry_open(&path, O_PATH, current_cred()); path_put(&path); } if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } fd_install(fd, file); return fd; } /* * Don't allow locked mount flags to be cleared. * * No locks need to be held here while testing the various MNT_LOCK * flags because those flags can never be cleared once they are set. */ static bool can_change_locked_flags(struct mount *mnt, unsigned int mnt_flags) { unsigned int fl = mnt->mnt.mnt_flags; if ((fl & MNT_LOCK_READONLY) && !(mnt_flags & MNT_READONLY)) return false; if ((fl & MNT_LOCK_NODEV) && !(mnt_flags & MNT_NODEV)) return false; if ((fl & MNT_LOCK_NOSUID) && !(mnt_flags & MNT_NOSUID)) return false; if ((fl & MNT_LOCK_NOEXEC) && !(mnt_flags & MNT_NOEXEC)) return false; if ((fl & MNT_LOCK_ATIME) && ((fl & MNT_ATIME_MASK) != (mnt_flags & MNT_ATIME_MASK))) return false; return true; } static int change_mount_ro_state(struct mount *mnt, unsigned int mnt_flags) { bool readonly_request = (mnt_flags & MNT_READONLY); if (readonly_request == __mnt_is_readonly(&mnt->mnt)) return 0; if (readonly_request) return mnt_make_readonly(mnt); return __mnt_unmake_readonly(mnt); } /* * Update the user-settable attributes on a mount. The caller must hold * sb->s_umount for writing. */ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) { lock_mount_hash(); mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK; mnt->mnt.mnt_flags = mnt_flags; touch_mnt_namespace(mnt->mnt_ns); unlock_mount_hash(); } static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; if (!__mnt_is_readonly(mnt) && (ktime_get_real_seconds() + TIME_UPTIME_SEC_MAX > sb->s_time_max)) { char *buf = (char *)__get_free_page(GFP_KERNEL); char *mntpath = buf ? d_path(mountpoint, buf, PAGE_SIZE) : ERR_PTR(-ENOMEM); struct tm tm; time64_to_tm(sb->s_time_max, 0, &tm); pr_warn("%s filesystem being %s at %s supports timestamps until %04ld (0x%llx)\n", sb->s_type->name, is_mounted(mnt) ? "remounted" : "mounted", mntpath, tm.tm_year+1900, (unsigned long long)sb->s_time_max); free_page((unsigned long)buf); } } /* * Handle reconfiguration of the mountpoint only without alteration of the * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); int ret; if (!check_mnt(mnt)) return -EINVAL; if (path->dentry != mnt->mnt.mnt_root) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; down_write(&sb->s_umount); ret = change_mount_ro_state(mnt, mnt_flags); if (ret == 0) set_mount_attributes(mnt, mnt_flags); up_write(&sb->s_umount); mnt_warn_timestamp_expiry(path, &mnt->mnt); return ret; } /* * change filesystem flags. dir should be a physical root of filesystem. * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. */ static int do_remount(struct path *path, int ms_flags, int sb_flags, int mnt_flags, void *data) { int err; struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); struct fs_context *fc; if (!check_mnt(mnt)) return -EINVAL; if (path->dentry != path->mnt->mnt_root) return -EINVAL; if (!can_change_locked_flags(mnt, mnt_flags)) return -EPERM; fc = fs_context_for_reconfigure(path->dentry, sb_flags, MS_RMT_MASK); if (IS_ERR(fc)) return PTR_ERR(fc); fc->oldapi = true; err = parse_monolithic_mount_data(fc, data); if (!err) { down_write(&sb->s_umount); err = -EPERM; if (ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) { err = reconfigure_super(fc); if (!err) set_mount_attributes(mnt, mnt_flags); } up_write(&sb->s_umount); } mnt_warn_timestamp_expiry(path, &mnt->mnt); put_fs_context(fc); return err; } static inline int tree_contains_unbindable(struct mount *mnt) { struct mount *p; for (p = mnt; p; p = next_mnt(p, mnt)) { if (IS_MNT_UNBINDABLE(p)) return 1; } return 0; } /* * Check that there aren't references to earlier/same mount namespaces in the * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. */ static bool check_for_nsfs_mounts(struct mount *subtree) { struct mount *p; bool ret = false; lock_mount_hash(); for (p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) goto out; ret = true; out: unlock_mount_hash(); return ret; } static int do_move_mount(struct path *old_path, struct path *new_path) { struct mnt_namespace *ns; struct mount *p; struct mount *old; struct mount *parent; struct mountpoint *mp, *old_mp; int err; bool attached; mp = lock_mount(new_path); if (IS_ERR(mp)) return PTR_ERR(mp); old = real_mount(old_path->mnt); p = real_mount(new_path->mnt); parent = old->mnt_parent; attached = mnt_has_parent(old); old_mp = old->mnt_mp; ns = old->mnt_ns; err = -EINVAL; /* The mountpoint must be in our namespace. */ if (!check_mnt(p)) goto out; /* The thing moved must be mounted... */ if (!is_mounted(&old->mnt)) goto out; /* ... and either ours or the root of anon namespace */ if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; if (old->mnt.mnt_flags & MNT_LOCKED) goto out; if (old_path->dentry != old_path->mnt->mnt_root) goto out; if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) goto out; /* * Don't move a mount residing in a shared parent. */ if (attached && IS_MNT_SHARED(parent)) goto out; /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. */ if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) goto out; err = -ELOOP; if (!check_for_nsfs_mounts(old)) goto out; for (; mnt_has_parent(p); p = p->mnt_parent) if (p == old) goto out; err = attach_recursive_mnt(old, real_mount(new_path->mnt), mp, attached); if (err) goto out; /* if the mount is moved, it should no longer be expire * automatically */ list_del_init(&old->mnt_expire); if (attached) put_mountpoint(old_mp); out: unlock_mount(mp); if (!err) { if (attached) mntput_no_expire(parent); else free_mnt_ns(ns); } return err; } static int do_move_mount_old(struct path *path, const char *old_name) { struct path old_path; int err; if (!old_name || !*old_name) return -EINVAL; err = kern_path(old_name, LOOKUP_FOLLOW, &old_path); if (err) return err; err = do_move_mount(&old_path, path); path_put(&old_path); return err; } /* * add a mount into a namespace's mount tree */ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, struct path *path, int mnt_flags) { struct mount *parent = real_mount(path->mnt); mnt_flags &= ~MNT_INTERNAL_FLAGS; if (unlikely(!check_mnt(parent))) { /* that's acceptable only for automounts done in private ns */ if (!(mnt_flags & MNT_SHRINKABLE)) return -EINVAL; /* ... and for those we'd better have mountpoint still alive */ if (!parent->mnt_ns) return -EINVAL; } /* Refuse the same filesystem on the same mount point */ if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path->mnt->mnt_root == path->dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; return graft_tree(newmnt, parent, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); /* * Create a new mount using a superblock configuration and request it * be added to the namespace tree. */ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, unsigned int mnt_flags) { struct vfsmount *mnt; struct mountpoint *mp; struct super_block *sb = fc->root->d_sb; int error; error = security_sb_kern_mount(sb); if (!error && mount_too_revealing(sb, &mnt_flags)) error = -EPERM; if (unlikely(error)) { fc_drop_locked(fc); return error; } up_write(&sb->s_umount); mnt = vfs_create_mount(fc); if (IS_ERR(mnt)) return PTR_ERR(mnt); mnt_warn_timestamp_expiry(mountpoint, mnt); mp = lock_mount(mountpoint); if (IS_ERR(mp)) { mntput(mnt); return PTR_ERR(mp); } error = do_add_mount(real_mount(mnt), mp, mountpoint, mnt_flags); unlock_mount(mp); if (error < 0) mntput(mnt); return error; } /* * create a new mount for userspace and request it to be added into the * namespace's tree */ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, int mnt_flags, const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; const char *subtype = NULL; int err = 0; if (!fstype) return -EINVAL; type = get_fs_type(fstype); if (!type) return -ENODEV; if (type->fs_flags & FS_HAS_SUBTYPE) { subtype = strchr(fstype, '.'); if (subtype) { subtype++; if (!*subtype) { put_filesystem(type); return -EINVAL; } } } fc = fs_context_for_mount(type, sb_flags); put_filesystem(type); if (IS_ERR(fc)) return PTR_ERR(fc); if (subtype) err = vfs_parse_fs_string(fc, "subtype", subtype, strlen(subtype)); if (!err && name) err = vfs_parse_fs_string(fc, "source", name, strlen(name)); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) err = vfs_get_tree(fc); if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } int finish_automount(struct vfsmount *m, struct path *path) { struct dentry *dentry = path->dentry; struct mountpoint *mp; struct mount *mnt; int err; if (!m) return 0; if (IS_ERR(m)) return PTR_ERR(m); mnt = real_mount(m); /* The new mount record should have at least 2 refs to prevent it being * expired before we get a chance to add it */ BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == dentry) { err = -ELOOP; goto discard; } /* * we don't want to use lock_mount() - in this case finding something * that overmounts our mountpoint to be means "quitely drop what we've * got", not "try to mount it on top". */ inode_lock(dentry->d_inode); namespace_lock(); if (unlikely(cant_mount(dentry))) { err = -ENOENT; goto discard_locked; } rcu_read_lock(); if (unlikely(__lookup_mnt(path->mnt, dentry))) { rcu_read_unlock(); err = 0; goto discard_locked; } rcu_read_unlock(); mp = get_mountpoint(dentry); if (IS_ERR(mp)) { err = PTR_ERR(mp); goto discard_locked; } err = do_add_mount(mnt, mp, path, path->mnt->mnt_flags | MNT_SHRINKABLE); unlock_mount(mp); if (unlikely(err)) goto discard; mntput(m); return 0; discard_locked: namespace_unlock(); inode_unlock(dentry->d_inode); discard: /* remove m from any expiration list it may be on */ if (!list_empty(&mnt->mnt_expire)) { namespace_lock(); list_del_init(&mnt->mnt_expire); namespace_unlock(); } mntput(m); mntput(m); return err; } /** * mnt_set_expiry - Put a mount on an expiration list * @mnt: The mount to list. * @expiry_list: The list to add the mount to. */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { namespace_lock(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); namespace_unlock(); } EXPORT_SYMBOL(mnt_set_expiry); /* * process a list of expirable mountpoints with the intent of discarding any * mountpoints that aren't in use and haven't been touched since last we came * here */ void mark_mounts_for_expiry(struct list_head *mounts) { struct mount *mnt, *next; LIST_HEAD(graveyard); if (list_empty(mounts)) return; namespace_lock(); lock_mount_hash(); /* extract from the expiration list every vfsmount that matches the * following criteria: * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; list_move(&mnt->mnt_expire, &graveyard); } while (!list_empty(&graveyard)) { mnt = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } unlock_mount_hash(); namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); /* * Ripoff of 'select_parent()' * * search the list of submounts for a given mountpoint, and move any * shrinkable submounts to the 'graveyard' list. */ static int select_submounts(struct mount *parent, struct list_head *graveyard) { struct mount *this_parent = parent; struct list_head *next; int found = 0; repeat: next = this_parent->mnt_mounts.next; resume: while (next != &this_parent->mnt_mounts) { struct list_head *tmp = next; struct mount *mnt = list_entry(tmp, struct mount, mnt_child); next = tmp->next; if (!(mnt->mnt.mnt_flags & MNT_SHRINKABLE)) continue; /* * Descend a level if the d_mounts list is non-empty. */ if (!list_empty(&mnt->mnt_mounts)) { this_parent = mnt; goto repeat; } if (!propagate_mount_busy(mnt, 1)) { list_move_tail(&mnt->mnt_expire, graveyard); found++; } } /* * All done at this level ... ascend and resume the search */ if (this_parent != parent) { next = this_parent->mnt_child.next; this_parent = this_parent->mnt_parent; goto resume; } return found; } /* * process a list of expirable mountpoints with the intent of discarding any * submounts of a specific parent mountpoint * * mount_lock must be held for write */ static void shrink_submounts(struct mount *mnt) { LIST_HEAD(graveyard); struct mount *m; /* extract submounts of 'mountpoint' from the expiration list */ while (select_submounts(mnt, &graveyard)) { while (!list_empty(&graveyard)) { m = list_first_entry(&graveyard, struct mount, mnt_expire); touch_mnt_namespace(m->mnt_ns); umount_tree(m, UMOUNT_PROPAGATE|UMOUNT_SYNC); } } } static void *copy_mount_options(const void __user * data) { char *copy; unsigned left, offset; if (!data) return NULL; copy = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!copy) return ERR_PTR(-ENOMEM); left = copy_from_user(copy, data, PAGE_SIZE); /* * Not all architectures have an exact copy_from_user(). Resort to * byte at a time. */ offset = PAGE_SIZE - left; while (left) { char c; if (get_user(c, (const char __user *)data + offset)) break; copy[offset] = c; left--; offset++; } if (left == PAGE_SIZE) { kfree(copy); return ERR_PTR(-EFAULT); } return copy; } static char *copy_mount_string(const void __user *data) { return data ? strndup_user(data, PATH_MAX) : NULL; } /* * Flags is a 32-bit value that allows up to 31 non-fs dependent flags to * be given to the mount() call (ie: read-only, no-dev, no-suid etc). * * data is a (void *) that can point to any structure up to * PAGE_SIZE-1 bytes, which can contain arbitrary fs-dependent * information (or be NULL). * * Pre-0.97 versions of mount() didn't have a flags word. * When the flags word was introduced its top half was required * to have the magic value 0xC0ED, and this remained so until 2.4.0-test9. * Therefore, if this magic number is present, it carries no information * and must be discarded. */ int path_mount(const char *dev_name, struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; int ret; /* Discard magic */ if ((flags & MS_MGC_MSK) == MS_MGC_VAL) flags &= ~MS_MGC_MSK; /* Basic sanity checks */ if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; if (flags & MS_NOUSER) return -EINVAL; ret = security_sb_mount(dev_name, path, type_page, flags, data_page); if (ret) return ret; if (!may_mount()) return -EPERM; if ((flags & SB_MANDLOCK) && !may_mandlock()) return -EPERM; /* Default to relatime unless overriden */ if (!(flags & MS_NOATIME)) mnt_flags |= MNT_RELATIME; /* Separate the per-mountpoint flags */ if (flags & MS_NOSUID) mnt_flags |= MNT_NOSUID; if (flags & MS_NODEV) mnt_flags |= MNT_NODEV; if (flags & MS_NOEXEC) mnt_flags |= MNT_NOEXEC; if (flags & MS_NOATIME) mnt_flags |= MNT_NOATIME; if (flags & MS_NODIRATIME) mnt_flags |= MNT_NODIRATIME; if (flags & MS_STRICTATIME) mnt_flags &= ~(MNT_RELATIME | MNT_NOATIME); if (flags & MS_RDONLY) mnt_flags |= MNT_READONLY; if (flags & MS_NOSYMFOLLOW) mnt_flags |= MNT_NOSYMFOLLOW; /* The default atime for remount is preservation */ if ((flags & MS_REMOUNT) && ((flags & (MS_NOATIME | MS_NODIRATIME | MS_RELATIME | MS_STRICTATIME)) == 0)) { mnt_flags &= ~MNT_ATIME_MASK; mnt_flags |= path->mnt->mnt_flags & MNT_ATIME_MASK; } sb_flags = flags & (SB_RDONLY | SB_SYNCHRONOUS | SB_MANDLOCK | SB_DIRSYNC | SB_SILENT | SB_POSIXACL | SB_LAZYTIME | SB_I_VERSION); if ((flags & (MS_REMOUNT | MS_BIND)) == (MS_REMOUNT | MS_BIND)) return do_reconfigure_mnt(path, mnt_flags); if (flags & MS_REMOUNT) return do_remount(path, flags, sb_flags, mnt_flags, data_page); if (flags & MS_BIND) return do_loopback(path, dev_name, flags & MS_REC); if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) return do_change_type(path, flags); if (flags & MS_MOVE) return do_move_mount_old(path, dev_name); return do_new_mount(path, type_page, sb_flags, mnt_flags, dev_name, data_page); } long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; ret = path_mount(dev_name, &path, type_page, flags, data_page); path_put(&path); return ret; } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) { return inc_ucount(ns, current_euid(), UCOUNT_MNT_NAMESPACES); } static void dec_mnt_namespaces(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_MNT_NAMESPACES); } static void free_mnt_ns(struct mnt_namespace *ns) { if (!is_anon_ns(ns)) ns_free_inum(&ns->ns); dec_mnt_namespaces(ns->ucounts); put_user_ns(ns->user_ns); kfree(ns); } /* * Assign a sequence number so we can detect when we attempt to bind * mount a reference to an older mount namespace into the current * mount namespace, preventing reference counting loops. A 64bit * number incrementing at 10Ghz will take 12,427 years to wrap which * is effectively never, so we can ignore the possibility. */ static atomic64_t mnt_ns_seq = ATOMIC64_INIT(1); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *user_ns, bool anon) { struct mnt_namespace *new_ns; struct ucounts *ucounts; int ret; ucounts = inc_mnt_namespaces(user_ns); if (!ucounts) return ERR_PTR(-ENOSPC); new_ns = kzalloc(sizeof(struct mnt_namespace), GFP_KERNEL); if (!new_ns) { dec_mnt_namespaces(ucounts); return ERR_PTR(-ENOMEM); } if (!anon) { ret = ns_alloc_inum(&new_ns->ns); if (ret) { kfree(new_ns); dec_mnt_namespaces(ucounts); return ERR_PTR(ret); } } new_ns->ns.ops = &mntns_operations; if (!anon) new_ns->seq = atomic64_add_return(1, &mnt_ns_seq); atomic_set(&new_ns->count, 1); INIT_LIST_HEAD(&new_ns->list); init_waitqueue_head(&new_ns->poll); spin_lock_init(&new_ns->ns_lock); new_ns->user_ns = get_user_ns(user_ns); new_ns->ucounts = ucounts; return new_ns; } __latent_entropy struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; struct mount *p, *q; struct mount *old; struct mount *new; int copy_flags; BUG_ON(!ns); if (likely(!(flags & CLONE_NEWNS))) { get_mnt_ns(ns); return ns; } old = ns->root; new_ns = alloc_mnt_ns(user_ns, false); if (IS_ERR(new_ns)) return new_ns; namespace_lock(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SHARED_TO_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { namespace_unlock(); free_mnt_ns(new_ns); return ERR_CAST(new); } if (user_ns != ns->user_ns) { lock_mount_hash(); lock_mnt_tree(new); unlock_mount_hash(); } new_ns->root = new; list_add_tail(&new_ns->list, &new->mnt_list); /* * Second pass: switch the tsk->fs->* elements and mark new vfsmounts * as belonging to new namespace. We have already acquired a private * fs_struct, so tsk->fs->lock is not needed. */ p = old; q = new; while (p) { q->mnt_ns = new_ns; new_ns->mounts++; if (new_fs) { if (&p->mnt == new_fs->root.mnt) { new_fs->root.mnt = mntget(&q->mnt); rootmnt = &p->mnt; } if (&p->mnt == new_fs->pwd.mnt) { new_fs->pwd.mnt = mntget(&q->mnt); pwdmnt = &p->mnt; } } p = next_mnt(p, old); q = next_mnt(q, new); if (!q) break; while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(p, old); } namespace_unlock(); if (rootmnt) mntput(rootmnt); if (pwdmnt) mntput(pwdmnt); return new_ns; } struct dentry *mount_subtree(struct vfsmount *m, const char *name) { struct mount *mnt = real_mount(m); struct mnt_namespace *ns; struct super_block *s; struct path path; int err; ns = alloc_mnt_ns(&init_user_ns, true); if (IS_ERR(ns)) { mntput(m); return ERR_CAST(ns); } mnt->mnt_ns = ns; ns->root = mnt; ns->mounts++; list_add(&mnt->mnt_list, &ns->list); err = vfs_path_lookup(m->mnt_root, m, name, LOOKUP_FOLLOW|LOOKUP_AUTOMOUNT, &path); put_mnt_ns(ns); if (err) return ERR_PTR(err); /* trade a vfsmount reference for active sb one */ s = path.mnt->mnt_sb; atomic_inc(&s->s_active); mntput(path.mnt); /* lock the sucker */ down_write(&s->s_umount); /* ... and return the root of (sub)tree on it */ return path.dentry; } EXPORT_SYMBOL(mount_subtree); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) { int ret; char *kernel_type; char *kernel_dev; void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); if (IS_ERR(kernel_type)) goto out_type; kernel_dev = copy_mount_string(dev_name); ret = PTR_ERR(kernel_dev); if (IS_ERR(kernel_dev)) goto out_dev; options = copy_mount_options(data); ret = PTR_ERR(options); if (IS_ERR(options)) goto out_data; ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); kfree(options); out_data: kfree(kernel_dev); out_dev: kfree(kernel_type); out_type: return ret; } /* * Create a kernel mount representation for a new, prepared superblock * (specified by fs_fd) and attach to an open_tree-like file descriptor. */ SYSCALL_DEFINE3(fsmount, int, fs_fd, unsigned int, flags, unsigned int, attr_flags) { struct mnt_namespace *ns; struct fs_context *fc; struct file *file; struct path newmount; struct mount *mnt; struct fd f; unsigned int mnt_flags = 0; long ret; if (!may_mount()) return -EPERM; if ((flags & ~(FSMOUNT_CLOEXEC)) != 0) return -EINVAL; if (attr_flags & ~(MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID | MOUNT_ATTR_NODEV | MOUNT_ATTR_NOEXEC | MOUNT_ATTR__ATIME | MOUNT_ATTR_NODIRATIME)) return -EINVAL; if (attr_flags & MOUNT_ATTR_RDONLY) mnt_flags |= MNT_READONLY; if (attr_flags & MOUNT_ATTR_NOSUID) mnt_flags |= MNT_NOSUID; if (attr_flags & MOUNT_ATTR_NODEV) mnt_flags |= MNT_NODEV; if (attr_flags & MOUNT_ATTR_NOEXEC) mnt_flags |= MNT_NOEXEC; if (attr_flags & MOUNT_ATTR_NODIRATIME) mnt_flags |= MNT_NODIRATIME; switch (attr_flags & MOUNT_ATTR__ATIME) { case MOUNT_ATTR_STRICTATIME: break; case MOUNT_ATTR_NOATIME: mnt_flags |= MNT_NOATIME; break; case MOUNT_ATTR_RELATIME: mnt_flags |= MNT_RELATIME; break; default: return -EINVAL; } f = fdget(fs_fd); if (!f.file) return -EBADF; ret = -EINVAL; if (f.file->f_op != &fscontext_fops) goto err_fsfd; fc = f.file->private_data; ret = mutex_lock_interruptible(&fc->uapi_mutex); if (ret < 0) goto err_fsfd; /* There must be a valid superblock or we can't mount it */ ret = -EINVAL; if (!fc->root) goto err_unlock; ret = -EPERM; if (mount_too_revealing(fc->root->d_sb, &mnt_flags)) { pr_warn("VFS: Mount too revealing\n"); goto err_unlock; } ret = -EBUSY; if (fc->phase != FS_CONTEXT_AWAITING_MOUNT) goto err_unlock; ret = -EPERM; if ((fc->sb_flags & SB_MANDLOCK) && !may_mandlock()) goto err_unlock; newmount.mnt = vfs_create_mount(fc); if (IS_ERR(newmount.mnt)) { ret = PTR_ERR(newmount.mnt); goto err_unlock; } newmount.dentry = dget(fc->root); newmount.mnt->mnt_flags = mnt_flags; /* We've done the mount bit - now move the file context into more or * less the same state as if we'd done an fspick(). We don't want to * do any memory allocation or anything like that at this point as we * don't want to have to handle any errors incurred. */ vfs_clean_context(fc); ns = alloc_mnt_ns(current->nsproxy->mnt_ns->user_ns, true); if (IS_ERR(ns)) { ret = PTR_ERR(ns); goto err_path; } mnt = real_mount(newmount.mnt); mnt->mnt_ns = ns; ns->root = mnt; ns->mounts = 1; list_add(&mnt->mnt_list, &ns->list); mntget(newmount.mnt); /* Attach to an apparent O_PATH fd with a note that we need to unmount * it, not just simply put it. */ file = dentry_open(&newmount, O_PATH, fc->cred); if (IS_ERR(file)) { dissolve_on_fput(newmount.mnt); ret = PTR_ERR(file); goto err_path; } file->f_mode |= FMODE_NEED_UNMOUNT; ret = get_unused_fd_flags((flags & FSMOUNT_CLOEXEC) ? O_CLOEXEC : 0); if (ret >= 0) fd_install(ret, file); else fput(file); err_path: path_put(&newmount); err_unlock: mutex_unlock(&fc->uapi_mutex); err_fsfd: fdput(f); return ret; } /* * Move a mount from one place to another. In combination with * fsopen()/fsmount() this is used to install a new mount and in combination * with open_tree(OPEN_TREE_CLONE [| AT_RECURSIVE]) it can be used to copy * a mount subtree. * * Note the flags value is a combination of MOVE_MOUNT_* flags. */ SYSCALL_DEFINE5(move_mount, int, from_dfd, const char __user *, from_pathname, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { struct path from_path, to_path; unsigned int lflags; int ret = 0; if (!may_mount()) return -EPERM; if (flags & ~MOVE_MOUNT__MASK) return -EINVAL; /* If someone gives a pathname, they aren't permitted to move * from an fd that requires unmount as we can't get at the flag * to clear it afterwards. */ lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); if (ret < 0) return ret; lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); if (ret < 0) goto out_from; ret = security_move_mount(&from_path, &to_path); if (ret < 0) goto out_to; ret = do_move_mount(&from_path, &to_path); out_to: path_put(&to_path); out_from: path_put(&from_path); return ret; } /* * Return true if path is reachable from root * * namespace_sem or mount_lock is held */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) { while (&mnt->mnt != root->mnt && mnt_has_parent(mnt)) { dentry = mnt->mnt_mountpoint; mnt = mnt->mnt_parent; } return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } bool path_is_under(const struct path *path1, const struct path *path2) { bool res; read_seqlock_excl(&mount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); read_sequnlock_excl(&mount_lock); return res; } EXPORT_SYMBOL(path_is_under); /* * pivot_root Semantics: * Moves the root file system of the current process to the directory put_old, * makes new_root as the new root file system of the current process, and sets * root/cwd of all processes which had them on the current root to new_root. * * Restrictions: * The new_root and put_old must be directories, and must not be on the * same file system as the current process root. The put_old must be * underneath new_root, i.e. adding a non-zero number of /.. to the string * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. * * Notes: * - we don't move root/cwd if they are not at the root (reason: if something * cared enough to change them, it's probably wrong to force them elsewhere) * - it's okay to pick a root that isn't the root of a file system, e.g. * /nfs/my_root where /nfs is the mount point. It must be a mountpoint, * though, so you may need to say mount --bind /nfs/my_root /nfs/my_root * first. */ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { struct path new, old, root; struct mount *new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; struct mountpoint *old_mp, *root_mp; int error; if (!may_mount()) return -EPERM; error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) goto out0; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) goto out1; error = security_sb_pivotroot(&old, &new); if (error) goto out2; get_fs_root(current->fs, &root); old_mp = lock_mount(&old); error = PTR_ERR(old_mp); if (IS_ERR(old_mp)) goto out3; error = -EINVAL; new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); old_mnt = real_mount(old.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) goto out4; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) goto out4; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) goto out4; error = -ENOENT; if (d_unlinked(new.dentry)) goto out4; error = -EBUSY; if (new_mnt == root_mnt || old_mnt == root_mnt) goto out4; /* loop, on the same file system */ error = -EINVAL; if (root.mnt->mnt_root != root.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) goto out4; /* not attached */ if (new.mnt->mnt_root != new.dentry) goto out4; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) goto out4; /* not attached */ /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) goto out4; lock_mount_hash(); umount_mnt(new_mnt); root_mp = unhash_mnt(root_mnt); /* we'll need its mountpoint */ if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { new_mnt->mnt.mnt_flags |= MNT_LOCKED; root_mnt->mnt.mnt_flags &= ~MNT_LOCKED; } /* mount old root on put_old */ attach_mnt(root_mnt, old_mnt, old_mp); /* mount new_root on / */ attach_mnt(new_mnt, root_parent, root_mp); mnt_add_count(root_parent, -1); touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); put_mountpoint(root_mp); unlock_mount_hash(); chroot_fs_refs(&root, &new); error = 0; out4: unlock_mount(old_mp); if (!error) mntput_no_expire(ex_parent); out3: path_put(&root); out2: path_put(&old); out1: path_put(&new); out0: return error; } static void __init init_mount_tree(void) { struct vfsmount *mnt; struct mount *m; struct mnt_namespace *ns; struct path root; mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); if (IS_ERR(mnt)) panic("Can't create rootfs"); ns = alloc_mnt_ns(&init_user_ns, false); if (IS_ERR(ns)) panic("Can't allocate initial namespace"); m = real_mount(mnt); m->mnt_ns = ns; ns->root = m; ns->mounts = 1; list_add(&m->mnt_list, &ns->list); init_task.nsproxy->mnt_ns = ns; get_mnt_ns(ns); root.mnt = mnt; root.dentry = mnt->mnt_root; mnt->mnt_flags |= MNT_LOCKED; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); } void __init mnt_init(void) { int err; mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); mount_hashtable = alloc_large_system_hash("Mount-cache", sizeof(struct hlist_head), mhash_entries, 19, HASH_ZERO, &m_hash_shift, &m_hash_mask, 0, 0); mountpoint_hashtable = alloc_large_system_hash("Mountpoint-cache", sizeof(struct hlist_head), mphash_entries, 19, HASH_ZERO, &mp_hash_shift, &mp_hash_mask, 0, 0); if (!mount_hashtable || !mountpoint_hashtable) panic("Failed to allocate mount hash table\n"); kernfs_init(); err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) printk(KERN_WARNING "%s: kobj create error\n", __func__); shmem_init(); init_rootfs(); init_mount_tree(); } void put_mnt_ns(struct mnt_namespace *ns) { if (!atomic_dec_and_test(&ns->count)) return; drop_collected_mounts(&ns->root->mnt); free_mnt_ns(ns); } struct vfsmount *kern_mount(struct file_system_type *type) { struct vfsmount *mnt; mnt = vfs_kern_mount(type, SB_KERNMOUNT, type->name, NULL); if (!IS_ERR(mnt)) { /* * it is a longterm mount, don't release mnt until * we unmount before file sys is unregistered */ real_mount(mnt)->mnt_ns = MNT_NS_INTERNAL; } return mnt; } EXPORT_SYMBOL_GPL(kern_mount); void kern_unmount(struct vfsmount *mnt) { /* release long term mount so mount point can be released */ if (!IS_ERR_OR_NULL(mnt)) { real_mount(mnt)->mnt_ns = NULL; synchronize_rcu(); /* yecchhh... */ mntput(mnt); } } EXPORT_SYMBOL(kern_unmount); void kern_unmount_array(struct vfsmount *mnt[], unsigned int num) { unsigned int i; for (i = 0; i < num; i++) if (mnt[i]) real_mount(mnt[i])->mnt_ns = NULL; synchronize_rcu_expedited(); for (i = 0; i < num; i++) mntput(mnt[i]); } EXPORT_SYMBOL(kern_unmount_array); bool our_mnt(struct vfsmount *mnt) { return check_mnt(real_mount(mnt)); } bool current_chrooted(void) { /* Does the current process have a non-standard root */ struct path ns_root; struct path fs_root; bool chrooted; /* Find the namespace root */ ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; ns_root.dentry = ns_root.mnt->mnt_root; path_get(&ns_root); while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) ; get_fs_root(current->fs, &fs_root); chrooted = !path_equal(&fs_root, &ns_root); path_put(&fs_root); path_put(&ns_root); return chrooted; } static bool mnt_already_visible(struct mnt_namespace *ns, const struct super_block *sb, int *new_mnt_flags) { int new_flags = *new_mnt_flags; struct mount *mnt; bool visible = false; down_read(&namespace_sem); lock_ns_list(ns); list_for_each_entry(mnt, &ns->list, mnt_list) { struct mount *child; int mnt_flags; if (mnt_is_cursor(mnt)) continue; if (mnt->mnt.mnt_sb->s_type != sb->s_type) continue; /* This mount is not fully visible if it's root directory * is not the root directory of the filesystem. */ if (mnt->mnt.mnt_root != mnt->mnt.mnt_sb->s_root) continue; /* A local view of the mount flags */ mnt_flags = mnt->mnt.mnt_flags; /* Don't miss readonly hidden in the superblock flags */ if (sb_rdonly(mnt->mnt.mnt_sb)) mnt_flags |= MNT_LOCK_READONLY; /* Verify the mount flags are equal to or more permissive * than the proposed new mount. */ if ((mnt_flags & MNT_LOCK_READONLY) && !(new_flags & MNT_READONLY)) continue; if ((mnt_flags & MNT_LOCK_ATIME) && ((mnt_flags & MNT_ATIME_MASK) != (new_flags & MNT_ATIME_MASK))) continue; /* This mount is not fully visible if there are any * locked child mounts that cover anything except for * empty directories. */ list_for_each_entry(child, &mnt->mnt_mounts, mnt_child) { struct inode *inode = child->mnt_mountpoint->d_inode; /* Only worry about locked mounts */ if (!(child->mnt.mnt_flags & MNT_LOCKED)) continue; /* Is the directory permanetly empty? */ if (!is_empty_dir_inode(inode)) goto next; } /* Preserve the locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); visible = true; goto found; next: ; } found: unlock_ns_list(ns); up_read(&namespace_sem); return visible; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) { const unsigned long required_iflags = SB_I_NOEXEC | SB_I_NODEV; struct mnt_namespace *ns = current->nsproxy->mnt_ns; unsigned long s_iflags; if (ns->user_ns == &init_user_ns) return false; /* Can this filesystem be too revealing? */ s_iflags = sb->s_iflags; if (!(s_iflags & SB_I_USERNS_VISIBLE)) return false; if ((s_iflags & required_iflags) != required_iflags) { WARN_ONCE(1, "Expected s_iflags to contain 0x%lx\n", required_iflags); return true; } return !mnt_already_visible(ns, sb, new_mnt_flags); } bool mnt_may_suid(struct vfsmount *mnt) { /* * Foreign mounts (accessed via fchdir or through /proc * symlinks) are always treated as if they are nosuid. This * prevents namespaces from trusting potentially unsafe * suid/sgid bits, file caps, or security labels that originate * in other namespaces. */ return !(mnt->mnt_flags & MNT_NOSUID) && check_mnt(real_mount(mnt)) && current_in_userns(mnt->mnt_sb->s_user_ns); } static struct ns_common *mntns_get(struct task_struct *task) { struct ns_common *ns = NULL; struct nsproxy *nsproxy; task_lock(task); nsproxy = task->nsproxy; if (nsproxy) { ns = &nsproxy->mnt_ns->ns; get_mnt_ns(to_mnt_ns(ns)); } task_unlock(task); return ns; } static void mntns_put(struct ns_common *ns) { put_mnt_ns(to_mnt_ns(ns)); } static int mntns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct fs_struct *fs = nsset->fs; struct mnt_namespace *mnt_ns = to_mnt_ns(ns), *old_mnt_ns; struct user_namespace *user_ns = nsset->cred->user_ns; struct path root; int err; if (!ns_capable(mnt_ns->user_ns, CAP_SYS_ADMIN) || !ns_capable(user_ns, CAP_SYS_CHROOT) || !ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; if (is_anon_ns(mnt_ns)) return -EINVAL; if (fs->users != 1) return -EINVAL; get_mnt_ns(mnt_ns); old_mnt_ns = nsproxy->mnt_ns; nsproxy->mnt_ns = mnt_ns; /* Find the root */ err = vfs_path_lookup(mnt_ns->root->mnt.mnt_root, &mnt_ns->root->mnt, "/", LOOKUP_DOWN, &root); if (err) { /* revert to old namespace */ nsproxy->mnt_ns = old_mnt_ns; put_mnt_ns(mnt_ns); return err; } put_mnt_ns(old_mnt_ns); /* Update the pwd and root */ set_fs_pwd(fs, &root); set_fs_root(fs, &root); path_put(&root); return 0; } static struct user_namespace *mntns_owner(struct ns_common *ns) { return to_mnt_ns(ns)->user_ns; } const struct proc_ns_operations mntns_operations = { .name = "mnt", .type = CLONE_NEWNS, .get = mntns_get, .put = mntns_put, .install = mntns_install, .owner = mntns_owner, };
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Universal TUN/TAP device driver. * Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com> */ #ifndef __IF_TUN_H #define __IF_TUN_H #include <uapi/linux/if_tun.h> #include <uapi/linux/virtio_net.h> #define TUN_XDP_FLAG 0x1UL #define TUN_MSG_UBUF 1 #define TUN_MSG_PTR 2 struct tun_msg_ctl { unsigned short type; unsigned short num; void *ptr; }; struct tun_xdp_hdr { int buflen; struct virtio_net_hdr gso; }; #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); static inline bool tun_is_xdp_frame(void *ptr) { return (unsigned long)ptr & TUN_XDP_FLAG; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return (void *)((unsigned long)xdp | TUN_XDP_FLAG); } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); } void tun_ptr_free(void *ptr); #else #include <linux/err.h> #include <linux/errno.h> struct file; struct socket; static inline struct socket *tun_get_socket(struct file *f) { return ERR_PTR(-EINVAL); } static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } static inline bool tun_is_xdp_frame(void *ptr) { return false; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } static inline void tun_ptr_free(void *ptr) { } #endif /* CONFIG_TUN */ #endif /* __IF_TUN_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 // SPDX-License-Identifier: GPL-2.0 /* * linux/kernel/capability.c * * Copyright (C) 1997 Andrew Main <zefram@fysh.org> * * Integrated into 2.1.97+, Andrew G. Morgan <morgan@kernel.org> * 30 May 2002: Cleanup, Robert M. Love <rml@tech9.net> */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/audit.h> #include <linux/capability.h> #include <linux/mm.h> #include <linux/export.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/pid_namespace.h> #include <linux/user_namespace.h> #include <linux/uaccess.h> /* * Leveraged for setting/resetting capabilities */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; static int __init file_caps_disable(char *str) { file_caps_enabled = 0; return 1; } __setup("no_file_caps", file_caps_disable); #ifdef CONFIG_MULTIUSER /* * More recent versions of libcap are available from: * * http://www.kernel.org/pub/linux/libs/security/linux-privs/ */ static void warn_legacy_capability_use(void) { char name[sizeof(current->comm)]; pr_info_once("warning: `%s' uses 32-bit capabilities (legacy support in use)\n", get_task_comm(name, current)); } /* * Version 2 capabilities worked fine, but the linux/capability.h file * that accompanied their introduction encouraged their use without * the necessary user-space source code changes. As such, we have * created a version 3 with equivalent functionality to version 2, but * with a header change to protect legacy source code from using * version 2 when it wanted to use version 1. If your system has code * that trips the following warning, it is using version 2 specific * capabilities and may be doing so insecurely. * * The remedy is to either upgrade your version of libcap (to 2.10+, * if the application is linked against it), or recompile your * application with modern kernel headers and this warning will go * away. */ static void warn_deprecated_v2(void) { char name[sizeof(current->comm)]; pr_info_once("warning: `%s' uses deprecated v2 capabilities in a way that may be insecure\n", get_task_comm(name, current)); } /* * Version check. Return the number of u32s in each capability flag * array, or a negative value on error. */ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy) { __u32 version; if (get_user(version, &header->version)) return -EFAULT; switch (version) { case _LINUX_CAPABILITY_VERSION_1: warn_legacy_capability_use(); *tocopy = _LINUX_CAPABILITY_U32S_1; break; case _LINUX_CAPABILITY_VERSION_2: warn_deprecated_v2(); fallthrough; /* v3 is otherwise equivalent to v2 */ case _LINUX_CAPABILITY_VERSION_3: *tocopy = _LINUX_CAPABILITY_U32S_3; break; default: if (put_user((u32)_KERNEL_CAPABILITY_VERSION, &header->version)) return -EFAULT; return -EINVAL; } return 0; } /* * The only thing that can change the capabilities of the current * process is the current process. As such, we can't be in this code * at the same time as we are in the process of setting capabilities * in this process. The net result is that we can limit our use of * locks to when we are reading the caps of another process. */ static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp, kernel_cap_t *pIp, kernel_cap_t *pPp) { int ret; if (pid && (pid != task_pid_vnr(current))) { struct task_struct *target; rcu_read_lock(); target = find_task_by_vpid(pid); if (!target) ret = -ESRCH; else ret = security_capget(target, pEp, pIp, pPp); rcu_read_unlock(); } else ret = security_capget(current, pEp, pIp, pPp); return ret; } /** * sys_capget - get the capabilities of a given process. * @header: pointer to struct that contains capability version and * target pid data * @dataptr: pointer to struct that contains the effective, permitted, * and inheritable capabilities that are returned * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) { int ret = 0; pid_t pid; unsigned tocopy; kernel_cap_t pE, pI, pP; ret = cap_validate_magic(header, &tocopy); if ((dataptr == NULL) || (ret != 0)) return ((dataptr == NULL) && (ret == -EINVAL)) ? 0 : ret; if (get_user(pid, &header->pid)) return -EFAULT; if (pid < 0) return -EINVAL; ret = cap_get_target_pid(pid, &pE, &pI, &pP); if (!ret) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i; for (i = 0; i < tocopy; i++) { kdata[i].effective = pE.cap[i]; kdata[i].permitted = pP.cap[i]; kdata[i].inheritable = pI.cap[i]; } /* * Note, in the case, tocopy < _KERNEL_CAPABILITY_U32S, * we silently drop the upper capabilities here. This * has the effect of making older libcap * implementations implicitly drop upper capability * bits when they perform a: capget/modify/capset * sequence. * * This behavior is considered fail-safe * behavior. Upgrading the application to a newer * version of libcap will enable access to the newer * capabilities. * * An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ if (copy_to_user(dataptr, kdata, tocopy * sizeof(struct __user_cap_data_struct))) { return -EFAULT; } } return ret; } /** * sys_capset - set capabilities for a process or (*) a group of processes * @header: pointer to struct that contains capability version and * target pid data * @data: pointer to struct that contains the effective, permitted, * and inheritable capabilities * * Set capabilities for the current process only. The ability to any other * process(es) has been deprecated and removed. * * The restrictions on setting capabilities are specified as: * * I: any raised capabilities must be a subset of the old permitted * P: any raised capabilities must be a subset of the old permitted * E: must be set to a subset of new permitted * * Returns 0 on success and < 0 on error. */ SYSCALL_DEFINE2(capset, cap_user_header_t, header, const cap_user_data_t, data) { struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; unsigned i, tocopy, copybytes; kernel_cap_t inheritable, permitted, effective; struct cred *new; int ret; pid_t pid; ret = cap_validate_magic(header, &tocopy); if (ret != 0) return ret; if (get_user(pid, &header->pid)) return -EFAULT; /* may only affect current now */ if (pid != 0 && pid != task_pid_vnr(current)) return -EPERM; copybytes = tocopy * sizeof(struct __user_cap_data_struct); if (copybytes > sizeof(kdata)) return -EFAULT; if (copy_from_user(&kdata, data, copybytes)) return -EFAULT; for (i = 0; i < tocopy; i++) { effective.cap[i] = kdata[i].effective; permitted.cap[i] = kdata[i].permitted; inheritable.cap[i] = kdata[i].inheritable; } while (i < _KERNEL_CAPABILITY_U32S) { effective.cap[i] = 0; permitted.cap[i] = 0; inheritable.cap[i] = 0; i++; } effective.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; permitted.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; inheritable.cap[CAP_LAST_U32] &= CAP_LAST_U32_VALID_MASK; new = prepare_creds(); if (!new) return -ENOMEM; ret = security_capset(new, current_cred(), &effective, &inheritable, &permitted); if (ret < 0) goto error; audit_log_capset(new, current_cred()); return commit_creds(new); error: abort_creds(new); return ret; } /** * has_ns_capability - Does a task have a capability in a specific user ns * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NONE); rcu_read_unlock(); return (ret == 0); } /** * has_capability - Does a task have a capability in init_user_ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the initial user namespace, false if not. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability(struct task_struct *t, int cap) { return has_ns_capability(t, &init_user_ns, cap); } EXPORT_SYMBOL(has_capability); /** * has_ns_capability_noaudit - Does a task have a capability (unaudited) * in a specific user ns. * @t: The task in question * @ns: target user namespace * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to the specified user namespace, false if not. * Do not write an audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_ns_capability_noaudit(struct task_struct *t, struct user_namespace *ns, int cap) { int ret; rcu_read_lock(); ret = security_capable(__task_cred(t), ns, cap, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); } /** * has_capability_noaudit - Does a task have a capability (unaudited) in the * initial user ns * @t: The task in question * @cap: The capability to be tested for * * Return true if the specified task has the given superior capability * currently in effect to init_user_ns, false if not. Don't write an * audit message for the check. * * Note that this does not set PF_SUPERPRIV on the task. */ bool has_capability_noaudit(struct task_struct *t, int cap) { return has_ns_capability_noaudit(t, &init_user_ns, cap); } static bool ns_capable_common(struct user_namespace *ns, int cap, unsigned int opts) { int capable; if (unlikely(!cap_valid(cap))) { pr_crit("capable() called with invalid cap=%u\n", cap); BUG(); } capable = security_capable(current_cred(), ns, cap, opts); if (capable == 0) { current->flags |= PF_SUPERPRIV; return true; } return false; } /** * ns_capable - Determine if the current task has a superior capability in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NONE); } EXPORT_SYMBOL(ns_capable); /** * ns_capable_noaudit - Determine if the current task has a superior capability * (unaudited) in effect * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_noaudit(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_NOAUDIT); } EXPORT_SYMBOL(ns_capable_noaudit); /** * ns_capable_setid - Determine if the current task has a superior capability * in effect, while signalling that this check is being done from within a * setid or setgroups syscall. * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool ns_capable_setid(struct user_namespace *ns, int cap) { return ns_capable_common(ns, cap, CAP_OPT_INSETID); } EXPORT_SYMBOL(ns_capable_setid); /** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * * Return true if the current task has the given superior capability currently * available for use, false if not. * * This sets PF_SUPERPRIV on the task if the capability is available on the * assumption that it's about to be used. */ bool capable(int cap) { return ns_capable(&init_user_ns, cap); } EXPORT_SYMBOL(capable); #endif /* CONFIG_MULTIUSER */ /** * file_ns_capable - Determine if the file's opener had a capability in effect * @file: The file we want to check * @ns: The usernamespace we want the capability in * @cap: The capability to be tested for * * Return true if task that opened the file had a capability in effect * when the file was opened. * * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; if (security_capable(file->f_cred, ns, cap, CAP_OPT_NONE) == 0) return true; return false; } EXPORT_SYMBOL(file_ns_capable); /** * privileged_wrt_inode_uidgid - Do capabilities in the namespace work over the inode? * @ns: The user namespace in question * @inode: The inode in question * * Return true if the inode uid and gid are within the namespace. */ bool privileged_wrt_inode_uidgid(struct user_namespace *ns, const struct inode *inode) { return kuid_has_mapping(ns, inode->i_uid) && kgid_has_mapping(ns, inode->i_gid); } /** * capable_wrt_inode_uidgid - Check nsown_capable and uid and gid mapped * @inode: The inode in question * @cap: The capability in question * * Return true if the current task has the given capability targeted at * its own user namespace and that the given inode's uid and gid are * mapped into the current user namespace. */ bool capable_wrt_inode_uidgid(const struct inode *inode, int cap) { struct user_namespace *ns = current_user_ns(); return ns_capable(ns, cap) && privileged_wrt_inode_uidgid(ns, inode); } EXPORT_SYMBOL(capable_wrt_inode_uidgid); /** * ptracer_capable - Determine if the ptracer holds CAP_SYS_PTRACE in the namespace * @tsk: The task that may be ptraced * @ns: The user namespace to search for CAP_SYS_PTRACE in * * Return true if the task that is ptracing the current task had CAP_SYS_PTRACE * in the specified user namespace. */ bool ptracer_capable(struct task_struct *tsk, struct user_namespace *ns) { int ret = 0; /* An absent tracer adds no restrictions */ const struct cred *cred; rcu_read_lock(); cred = rcu_dereference(tsk->ptracer_cred); if (cred) ret = security_capable(cred, ns, CAP_SYS_PTRACE, CAP_OPT_NOAUDIT); rcu_read_unlock(); return (ret == 0); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_CPUMASK_H #define __LINUX_CPUMASK_H /* * Cpumasks provide a bitmap suitable for representing the * set of CPU's in a system, one bit position per CPU number. In general, * only nr_cpu_ids (<= NR_CPUS) bits are valid. */ #include <linux/kernel.h> #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/atomic.h> #include <linux/bug.h> /* Don't assign or return these: may not be this big! */ typedef struct cpumask { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; /** * cpumask_bits - get the bits in a cpumask * @maskp: the struct cpumask * * * You should only assume nr_cpu_ids bits of this mask are valid. This is * a macro so it's const-correct. */ #define cpumask_bits(maskp) ((maskp)->bits) /** * cpumask_pr_args - printf args to output a cpumask * @maskp: cpumask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a cpumask. */ #define cpumask_pr_args(maskp) nr_cpu_ids, cpumask_bits(maskp) #if NR_CPUS == 1 #define nr_cpu_ids 1U #else extern unsigned int nr_cpu_ids; #endif #ifdef CONFIG_CPUMASK_OFFSTACK /* Assuming NR_CPUS is huge, a runtime limit is more efficient. Also, * not all bits may be allocated. */ #define nr_cpumask_bits nr_cpu_ids #else #define nr_cpumask_bits ((unsigned int)NR_CPUS) #endif /* * The following particular system cpumasks and operations manage * possible, present, active and online cpus. * * cpu_possible_mask- has bit 'cpu' set iff cpu is populatable * cpu_present_mask - has bit 'cpu' set iff cpu is populated * cpu_online_mask - has bit 'cpu' set iff cpu available to scheduler * cpu_active_mask - has bit 'cpu' set iff cpu available to migration * * If !CONFIG_HOTPLUG_CPU, present == possible, and active == online. * * The cpu_possible_mask is fixed at boot time, as the set of CPU id's * that it is possible might ever be plugged in at anytime during the * life of that system boot. The cpu_present_mask is dynamic(*), * representing which CPUs are currently plugged in. And * cpu_online_mask is the dynamic subset of cpu_present_mask, * indicating those CPUs available for scheduling. * * If HOTPLUG is enabled, then cpu_possible_mask is forced to have * all NR_CPUS bits set, otherwise it is just the set of CPUs that * ACPI reports present at boot. * * If HOTPLUG is enabled, then cpu_present_mask varies dynamically, * depending on what ACPI reports as currently plugged in, otherwise * cpu_present_mask is just a copy of cpu_possible_mask. * * (*) Well, cpu_present_mask is dynamic in the hotplug case. If not * hotplug, it's a copy of cpu_possible_mask, hence fixed at boot. * * Subtleties: * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode * assumption that their single CPU is online. The UP * cpu_{online,possible,present}_masks are placebos. Changing them * will have no useful affect on the following num_*_cpus() * and cpu_*() macros in the UP case. This ugliness is a UP * optimization - don't waste any instructions or memory references * asking if you're online or how many CPUs there are if there is * only one CPU. */ extern struct cpumask __cpu_possible_mask; extern struct cpumask __cpu_online_mask; extern struct cpumask __cpu_present_mask; extern struct cpumask __cpu_active_mask; #define cpu_possible_mask ((const struct cpumask *)&__cpu_possible_mask) #define cpu_online_mask ((const struct cpumask *)&__cpu_online_mask) #define cpu_present_mask ((const struct cpumask *)&__cpu_present_mask) #define cpu_active_mask ((const struct cpumask *)&__cpu_active_mask) extern atomic_t __num_online_cpus; #if NR_CPUS > 1 /** * num_online_cpus() - Read the number of online CPUs * * Despite the fact that __num_online_cpus is of type atomic_t, this * interface gives only a momentary snapshot and is not protected against * concurrent CPU hotplug operations unless invoked from a cpuhp_lock held * region. */ static inline unsigned int num_online_cpus(void) { return atomic_read(&__num_online_cpus); } #define num_possible_cpus() cpumask_weight(cpu_possible_mask) #define num_present_cpus() cpumask_weight(cpu_present_mask) #define num_active_cpus() cpumask_weight(cpu_active_mask) #define cpu_online(cpu) cpumask_test_cpu((cpu), cpu_online_mask) #define cpu_possible(cpu) cpumask_test_cpu((cpu), cpu_possible_mask) #define cpu_present(cpu) cpumask_test_cpu((cpu), cpu_present_mask) #define cpu_active(cpu) cpumask_test_cpu((cpu), cpu_active_mask) #else #define num_online_cpus() 1U #define num_possible_cpus() 1U #define num_present_cpus() 1U #define num_active_cpus() 1U #define cpu_online(cpu) ((cpu) == 0) #define cpu_possible(cpu) ((cpu) == 0) #define cpu_present(cpu) ((cpu) == 0) #define cpu_active(cpu) ((cpu) == 0) #endif extern cpumask_t cpus_booted_once_mask; static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits) { #ifdef CONFIG_DEBUG_PER_CPU_MAPS WARN_ON_ONCE(cpu >= bits); #endif /* CONFIG_DEBUG_PER_CPU_MAPS */ } /* verify cpu argument to cpumask_* operators */ static inline unsigned int cpumask_check(unsigned int cpu) { cpu_max_bits_warn(cpu, nr_cpumask_bits); return cpu; } #if NR_CPUS == 1 /* Uniprocessor. Assume all masks are "1". */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return 0; } static inline unsigned int cpumask_last(const struct cpumask *srcp) { return 0; } /* Valid inputs for n are -1 and 0. */ static inline unsigned int cpumask_next(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { return n+1; } static inline unsigned int cpumask_next_and(int n, const struct cpumask *srcp, const struct cpumask *andp) { return n+1; } static inline unsigned int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap) { /* cpu0 unless stop condition, wrap and at cpu0, then nr_cpumask_bits */ return (wrap && n == 0); } /* cpu must be a valid cpu, ie 0, so there's no other choice. */ static inline unsigned int cpumask_any_but(const struct cpumask *mask, unsigned int cpu) { return 1; } static inline unsigned int cpumask_local_spread(unsigned int i, int node) { return 0; } static inline int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p) { return cpumask_next_and(-1, src1p, src2p); } #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask, (void)(start)) #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask1, (void)mask2) #else /** * cpumask_first - get the first cpu in a cpumask * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no cpus set. */ static inline unsigned int cpumask_first(const struct cpumask *srcp) { return find_first_bit(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_last - get the last CPU in a cpumask * @srcp: - the cpumask pointer * * Returns >= nr_cpumask_bits if no CPUs set. */ static inline unsigned int cpumask_last(const struct cpumask *srcp) { return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits); } unsigned int cpumask_next(int n, const struct cpumask *srcp); /** * cpumask_next_zero - get the next unset cpu in a cpumask * @n: the cpu prior to the place to search (ie. return will be > @n) * @srcp: the cpumask pointer * * Returns >= nr_cpu_ids if no further cpus unset. */ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp) { /* -1 is a legal arg here. */ if (n != -1) cpumask_check(n); return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1); } int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *); int cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); /** * for_each_cpu - iterate over every cpu in a mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next((cpu), (mask)), \ (cpu) < nr_cpu_ids;) /** * for_each_cpu_not - iterate over every cpu in a complemented mask * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask pointer * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_not(cpu, mask) \ for ((cpu) = -1; \ (cpu) = cpumask_next_zero((cpu), (mask)), \ (cpu) < nr_cpu_ids;) extern int cpumask_next_wrap(int n, const struct cpumask *mask, int start, bool wrap); /** * for_each_cpu_wrap - iterate over every cpu in a mask, starting at a specified location * @cpu: the (optionally unsigned) integer iterator * @mask: the cpumask poiter * @start: the start location * * The implementation does not assume any bit in @mask is set (including @start). * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_wrap(cpu, mask, start) \ for ((cpu) = cpumask_next_wrap((start)-1, (mask), (start), false); \ (cpu) < nr_cpumask_bits; \ (cpu) = cpumask_next_wrap((cpu), (mask), (start), true)) /** * for_each_cpu_and - iterate over every cpu in both masks * @cpu: the (optionally unsigned) integer iterator * @mask1: the first cpumask pointer * @mask2: the second cpumask pointer * * This saves a temporary CPU mask in many places. It is equivalent to: * struct cpumask tmp; * cpumask_and(&tmp, &mask1, &mask2); * for_each_cpu(cpu, &tmp) * ... * * After the loop, cpu is >= nr_cpu_ids. */ #define for_each_cpu_and(cpu, mask1, mask2) \ for ((cpu) = -1; \ (cpu) = cpumask_next_and((cpu), (mask1), (mask2)), \ (cpu) < nr_cpu_ids;) #endif /* SMP */ #define CPU_BITS_NONE \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } #define CPU_BITS_CPU0 \ { \ [0] = 1UL \ } /** * cpumask_set_cpu - set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_set_cpu(unsigned int cpu, struct cpumask *dstp) { __set_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_clear_cpu - clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @dstp: the cpumask pointer */ static inline void cpumask_clear_cpu(int cpu, struct cpumask *dstp) { clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } static inline void __cpumask_clear_cpu(int cpu, struct cpumask *dstp) { __clear_bit(cpumask_check(cpu), cpumask_bits(dstp)); } /** * cpumask_test_cpu - test for a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in @cpumask, else returns 0 */ static inline int cpumask_test_cpu(int cpu, const struct cpumask *cpumask) { return test_bit(cpumask_check(cpu), cpumask_bits((cpumask))); } /** * cpumask_test_and_set_cpu - atomically test and set a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_set_bit wrapper for cpumasks. */ static inline int cpumask_test_and_set_cpu(int cpu, struct cpumask *cpumask) { return test_and_set_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_test_and_clear_cpu - atomically test and clear a cpu in a cpumask * @cpu: cpu number (< nr_cpu_ids) * @cpumask: the cpumask pointer * * Returns 1 if @cpu is set in old bitmap of @cpumask, else returns 0 * * test_and_clear_bit wrapper for cpumasks. */ static inline int cpumask_test_and_clear_cpu(int cpu, struct cpumask *cpumask) { return test_and_clear_bit(cpumask_check(cpu), cpumask_bits(cpumask)); } /** * cpumask_setall - set all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_setall(struct cpumask *dstp) { bitmap_fill(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_clear - clear all cpus (< nr_cpu_ids) in a cpumask * @dstp: the cpumask pointer */ static inline void cpumask_clear(struct cpumask *dstp) { bitmap_zero(cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_and - *dstp = *src1p & *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_and(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_and(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_or - *dstp = *src1p | *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_or(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_or(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_xor - *dstp = *src1p ^ *src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input */ static inline void cpumask_xor(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { bitmap_xor(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_andnot - *dstp = *src1p & ~*src2p * @dstp: the cpumask result * @src1p: the first input * @src2p: the second input * * If *@dstp is empty, returns 0, else returns 1 */ static inline int cpumask_andnot(struct cpumask *dstp, const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_andnot(cpumask_bits(dstp), cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_complement - *dstp = ~*srcp * @dstp: the cpumask result * @srcp: the input to invert */ static inline void cpumask_complement(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_complement(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_equal - *src1p == *src2p * @src1p: the first input * @src2p: the second input */ static inline bool cpumask_equal(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_equal(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_or_equal - *src1p | *src2p == *src3p * @src1p: the first input * @src2p: the second input * @src3p: the third input */ static inline bool cpumask_or_equal(const struct cpumask *src1p, const struct cpumask *src2p, const struct cpumask *src3p) { return bitmap_or_equal(cpumask_bits(src1p), cpumask_bits(src2p), cpumask_bits(src3p), nr_cpumask_bits); } /** * cpumask_intersects - (*src1p & *src2p) != 0 * @src1p: the first input * @src2p: the second input */ static inline bool cpumask_intersects(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_intersects(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_subset - (*src1p & ~*src2p) == 0 * @src1p: the first input * @src2p: the second input * * Returns 1 if *@src1p is a subset of *@src2p, else returns 0 */ static inline int cpumask_subset(const struct cpumask *src1p, const struct cpumask *src2p) { return bitmap_subset(cpumask_bits(src1p), cpumask_bits(src2p), nr_cpumask_bits); } /** * cpumask_empty - *srcp == 0 * @srcp: the cpumask to that all cpus < nr_cpu_ids are clear. */ static inline bool cpumask_empty(const struct cpumask *srcp) { return bitmap_empty(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_full - *srcp == 0xFFFFFFFF... * @srcp: the cpumask to that all cpus < nr_cpu_ids are set. */ static inline bool cpumask_full(const struct cpumask *srcp) { return bitmap_full(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_weight - Count of bits in *srcp * @srcp: the cpumask to count bits (< nr_cpu_ids) in. */ static inline unsigned int cpumask_weight(const struct cpumask *srcp) { return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_shift_right - *dstp = *srcp >> n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static inline void cpumask_shift_right(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_right(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_shift_left - *dstp = *srcp << n * @dstp: the cpumask result * @srcp: the input to shift * @n: the number of bits to shift by */ static inline void cpumask_shift_left(struct cpumask *dstp, const struct cpumask *srcp, int n) { bitmap_shift_left(cpumask_bits(dstp), cpumask_bits(srcp), n, nr_cpumask_bits); } /** * cpumask_copy - *dstp = *srcp * @dstp: the result * @srcp: the input cpumask */ static inline void cpumask_copy(struct cpumask *dstp, const struct cpumask *srcp) { bitmap_copy(cpumask_bits(dstp), cpumask_bits(srcp), nr_cpumask_bits); } /** * cpumask_any - pick a "random" cpu from *srcp * @srcp: the input cpumask * * Returns >= nr_cpu_ids if no cpus set. */ #define cpumask_any(srcp) cpumask_first(srcp) /** * cpumask_first_and - return the first cpu from *srcp1 & *srcp2 * @src1p: the first input * @src2p: the second input * * Returns >= nr_cpu_ids if no cpus set in both. See also cpumask_next_and(). */ #define cpumask_first_and(src1p, src2p) cpumask_next_and(-1, (src1p), (src2p)) /** * cpumask_any_and - pick a "random" cpu from *mask1 & *mask2 * @mask1: the first input cpumask * @mask2: the second input cpumask * * Returns >= nr_cpu_ids if no cpus set. */ #define cpumask_any_and(mask1, mask2) cpumask_first_and((mask1), (mask2)) /** * cpumask_of - the cpumask containing just a given cpu * @cpu: the cpu (<= nr_cpu_ids) */ #define cpumask_of(cpu) (get_cpu_mask(cpu)) /** * cpumask_parse_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parse_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parse_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parselist_user - extract a cpumask from a user string * @buf: the buffer to extract from * @len: the length of the buffer * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parselist_user(const char __user *buf, int len, struct cpumask *dstp) { return bitmap_parselist_user(buf, len, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_parse - extract a cpumask from a string * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpumask_parse(const char *buf, struct cpumask *dstp) { return bitmap_parse(buf, UINT_MAX, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpulist_parse - extract a cpumask from a user string of ranges * @buf: the buffer to extract from * @dstp: the cpumask to set. * * Returns -errno, or 0 for success. */ static inline int cpulist_parse(const char *buf, struct cpumask *dstp) { return bitmap_parselist(buf, cpumask_bits(dstp), nr_cpumask_bits); } /** * cpumask_size - size to allocate for a 'struct cpumask' in bytes */ static inline unsigned int cpumask_size(void) { return BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long); } /* * cpumask_var_t: struct cpumask for stack usage. * * Oh, the wicked games we play! In order to make kernel coding a * little more difficult, we typedef cpumask_var_t to an array or a * pointer: doing &mask on an array is a noop, so it still works. * * ie. * cpumask_var_t tmpmask; * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) * return -ENOMEM; * * ... use 'tmpmask' like a normal struct cpumask * ... * * free_cpumask_var(tmpmask); * * * However, one notable exception is there. alloc_cpumask_var() allocates * only nr_cpumask_bits bits (in the other hand, real cpumask_t always has * NR_CPUS bits). Therefore you don't have to dereference cpumask_var_t. * * cpumask_var_t tmpmask; * if (!alloc_cpumask_var(&tmpmask, GFP_KERNEL)) * return -ENOMEM; * * var = *tmpmask; * * This code makes NR_CPUS length memcopy and brings to a memory corruption. * cpumask_copy() provide safe copy functionality. * * Note that there is another evil here: If you define a cpumask_var_t * as a percpu variable then the way to obtain the address of the cpumask * structure differently influences what this_cpu_* operation needs to be * used. Please use this_cpu_cpumask_var_t in those cases. The direct use * of this_cpu_ptr() or this_cpu_read() will lead to failures when the * other type of cpumask_var_t implementation is configured. * * Please also note that __cpumask_var_read_mostly can be used to declare * a cpumask_var_t variable itself (not its content) as read mostly. */ #ifdef CONFIG_CPUMASK_OFFSTACK typedef struct cpumask *cpumask_var_t; #define this_cpu_cpumask_var_ptr(x) this_cpu_read(x) #define __cpumask_var_read_mostly __read_mostly bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node); bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags); void alloc_bootmem_cpumask_var(cpumask_var_t *mask); void free_cpumask_var(cpumask_var_t mask); void free_bootmem_cpumask_var(cpumask_var_t mask); static inline bool cpumask_available(cpumask_var_t mask) { return mask != NULL; } #else typedef struct cpumask cpumask_var_t[1]; #define this_cpu_cpumask_var_ptr(x) this_cpu_ptr(x) #define __cpumask_var_read_mostly static inline bool alloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { return true; } static inline bool alloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { return true; } static inline bool zalloc_cpumask_var(cpumask_var_t *mask, gfp_t flags) { cpumask_clear(*mask); return true; } static inline bool zalloc_cpumask_var_node(cpumask_var_t *mask, gfp_t flags, int node) { cpumask_clear(*mask); return true; } static inline void alloc_bootmem_cpumask_var(cpumask_var_t *mask) { } static inline void free_cpumask_var(cpumask_var_t mask) { } static inline void free_bootmem_cpumask_var(cpumask_var_t mask) { } static inline bool cpumask_available(cpumask_var_t mask) { return true; } #endif /* CONFIG_CPUMASK_OFFSTACK */ /* It's common to want to use cpu_all_mask in struct member initializers, * so it has to refer to an address rather than a pointer. */ extern const DECLARE_BITMAP(cpu_all_bits, NR_CPUS); #define cpu_all_mask to_cpumask(cpu_all_bits) /* First bits of cpu_bit_bitmap are in fact unset. */ #define cpu_none_mask to_cpumask(cpu_bit_bitmap[0]) #define for_each_possible_cpu(cpu) for_each_cpu((cpu), cpu_possible_mask) #define for_each_online_cpu(cpu) for_each_cpu((cpu), cpu_online_mask) #define for_each_present_cpu(cpu) for_each_cpu((cpu), cpu_present_mask) /* Wrappers for arch boot code to manipulate normally-constant masks */ void init_cpu_present(const struct cpumask *src); void init_cpu_possible(const struct cpumask *src); void init_cpu_online(const struct cpumask *src); static inline void reset_cpu_possible_mask(void) { bitmap_zero(cpumask_bits(&__cpu_possible_mask), NR_CPUS); } static inline void set_cpu_possible(unsigned int cpu, bool possible) { if (possible) cpumask_set_cpu(cpu, &__cpu_possible_mask); else cpumask_clear_cpu(cpu, &__cpu_possible_mask); } static inline void set_cpu_present(unsigned int cpu, bool present) { if (present) cpumask_set_cpu(cpu, &__cpu_present_mask); else cpumask_clear_cpu(cpu, &__cpu_present_mask); } void set_cpu_online(unsigned int cpu, bool online); static inline void set_cpu_active(unsigned int cpu, bool active) { if (active) cpumask_set_cpu(cpu, &__cpu_active_mask); else cpumask_clear_cpu(cpu, &__cpu_active_mask); } /** * to_cpumask - convert an NR_CPUS bitmap to a struct cpumask * * @bitmap: the bitmap * * There are a few places where cpumask_var_t isn't appropriate and * static cpumasks must be used (eg. very early boot), yet we don't * expose the definition of 'struct cpumask'. * * This does the conversion, and can be used as a constant initializer. */ #define to_cpumask(bitmap) \ ((struct cpumask *)(1 ? (bitmap) \ : (void *)sizeof(__check_is_bitmap(bitmap)))) static inline int __check_is_bitmap(const unsigned long *bitmap) { return 1; } /* * Special-case data structure for "single bit set only" constant CPU masks. * * We pre-generate all the 64 (or 32) possible bit positions, with enough * padding to the left and the right, and return the constant pointer * appropriately offset. */ extern const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)]; static inline const struct cpumask *get_cpu_mask(unsigned int cpu) { const unsigned long *p = cpu_bit_bitmap[1 + cpu % BITS_PER_LONG]; p -= cpu / BITS_PER_LONG; return to_cpumask(p); } #define cpu_is_offline(cpu) unlikely(!cpu_online(cpu)) #if NR_CPUS <= BITS_PER_LONG #define CPU_BITS_ALL \ { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #else /* NR_CPUS > BITS_PER_LONG */ #define CPU_BITS_ALL \ { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } #endif /* NR_CPUS > BITS_PER_LONG */ /** * cpumap_print_to_pagebuf - copies the cpumask into the buffer either * as comma-separated list of cpus or hex values of cpumask * @list: indicates whether the cpumap must be list * @mask: the cpumask to copy * @buf: the buffer to copy into * * Returns the length of the (null-terminated) @buf string, zero if * nothing is copied. */ static inline ssize_t cpumap_print_to_pagebuf(bool list, char *buf, const struct cpumask *mask) { return bitmap_print_to_pagebuf(list, buf, cpumask_bits(mask), nr_cpu_ids); } #if NR_CPUS <= BITS_PER_LONG #define CPU_MASK_ALL \ (cpumask_t) { { \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #else #define CPU_MASK_ALL \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ [BITS_TO_LONGS(NR_CPUS)-1] = BITMAP_LAST_WORD_MASK(NR_CPUS) \ } } #endif /* NR_CPUS > BITS_PER_LONG */ #define CPU_MASK_NONE \ (cpumask_t) { { \ [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ } } #define CPU_MASK_CPU0 \ (cpumask_t) { { \ [0] = 1UL \ } } #endif /* __LINUX_CPUMASK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM compaction #if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_COMPACTION_H #include <linux/types.h> #include <linux/list.h> #include <linux/tracepoint.h> #include <trace/events/mmflags.h> DECLARE_EVENT_CLASS(mm_compaction_isolate_template, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken), TP_STRUCT__entry( __field(unsigned long, start_pfn) __field(unsigned long, end_pfn) __field(unsigned long, nr_scanned) __field(unsigned long, nr_taken) ), TP_fast_assign( __entry->start_pfn = start_pfn; __entry->end_pfn = end_pfn; __entry->nr_scanned = nr_scanned; __entry->nr_taken = nr_taken; ), TP_printk("range=(0x%lx ~ 0x%lx) nr_scanned=%lu nr_taken=%lu", __entry->start_pfn, __entry->end_pfn, __entry->nr_scanned, __entry->nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TP_PROTO( unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_scanned, unsigned long nr_taken), TP_ARGS(start_pfn, end_pfn, nr_scanned, nr_taken) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned long nr_all, int migrate_rc, struct list_head *migratepages), TP_ARGS(nr_all, migrate_rc, migratepages), TP_STRUCT__entry( __field(unsigned long, nr_migrated) __field(unsigned long, nr_failed) ), TP_fast_assign( unsigned long nr_failed = 0; struct list_head *page_lru; /* * migrate_pages() returns either a non-negative number * with the number of pages that failed migration, or an * error code, in which case we need to count the remaining * pages manually */ if (migrate_rc >= 0) nr_failed = migrate_rc; else list_for_each(page_lru, migratepages) nr_failed++; __entry->nr_migrated = nr_all - nr_failed; __entry->nr_failed = nr_failed; ), TP_printk("nr_migrated=%lu nr_failed=%lu", __entry->nr_migrated, __entry->nr_failed) ); TRACE_EVENT(mm_compaction_begin, TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, unsigned long free_pfn, unsigned long zone_end, bool sync), TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = migrate_pfn; __entry->free_pfn = free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async") ); TRACE_EVENT(mm_compaction_end, TP_PROTO(unsigned long zone_start, unsigned long migrate_pfn, unsigned long free_pfn, unsigned long zone_end, bool sync, int status), TP_ARGS(zone_start, migrate_pfn, free_pfn, zone_end, sync, status), TP_STRUCT__entry( __field(unsigned long, zone_start) __field(unsigned long, migrate_pfn) __field(unsigned long, free_pfn) __field(unsigned long, zone_end) __field(bool, sync) __field(int, status) ), TP_fast_assign( __entry->zone_start = zone_start; __entry->migrate_pfn = migrate_pfn; __entry->free_pfn = free_pfn; __entry->zone_end = zone_end; __entry->sync = sync; __entry->status = status; ), TP_printk("zone_start=0x%lx migrate_pfn=0x%lx free_pfn=0x%lx zone_end=0x%lx, mode=%s status=%s", __entry->zone_start, __entry->migrate_pfn, __entry->free_pfn, __entry->zone_end, __entry->sync ? "sync" : "async", __print_symbolic(__entry->status, COMPACTION_STATUS)) ); TRACE_EVENT(mm_compaction_try_to_compact_pages, TP_PROTO( int order, gfp_t gfp_mask, int prio), TP_ARGS(order, gfp_mask, prio), TP_STRUCT__entry( __field(int, order) __field(gfp_t, gfp_mask) __field(int, prio) ), TP_fast_assign( __entry->order = order; __entry->gfp_mask = gfp_mask; __entry->prio = prio; ), TP_printk("order=%d gfp_mask=%s priority=%d", __entry->order, show_gfp_flags(__entry->gfp_mask), __entry->prio) ); DECLARE_EVENT_CLASS(mm_compaction_suitable_template, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(int, ret) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->ret = ret; ), TP_printk("node=%d zone=%-8s order=%d ret=%s", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __print_symbolic(__entry->ret, COMPACTION_STATUS)) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_finished, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DEFINE_EVENT(mm_compaction_suitable_template, mm_compaction_suitable, TP_PROTO(struct zone *zone, int order, int ret), TP_ARGS(zone, order, ret) ); DECLARE_EVENT_CLASS(mm_compaction_defer_template, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order), TP_STRUCT__entry( __field(int, nid) __field(enum zone_type, idx) __field(int, order) __field(unsigned int, considered) __field(unsigned int, defer_shift) __field(int, order_failed) ), TP_fast_assign( __entry->nid = zone_to_nid(zone); __entry->idx = zone_idx(zone); __entry->order = order; __entry->considered = zone->compact_considered; __entry->defer_shift = zone->compact_defer_shift; __entry->order_failed = zone->compact_order_failed; ), TP_printk("node=%d zone=%-8s order=%d order_failed=%d consider=%u limit=%lu", __entry->nid, __print_symbolic(__entry->idx, ZONE_TYPE), __entry->order, __entry->order_failed, __entry->considered, 1UL << __entry->defer_shift) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_deferred, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_compaction, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); DEFINE_EVENT(mm_compaction_defer_template, mm_compaction_defer_reset, TP_PROTO(struct zone *zone, int order), TP_ARGS(zone, order) ); TRACE_EVENT(mm_compaction_kcompactd_sleep, TP_PROTO(int nid), TP_ARGS(nid), TP_STRUCT__entry( __field(int, nid) ), TP_fast_assign( __entry->nid = nid; ), TP_printk("nid=%d", __entry->nid) ); DECLARE_EVENT_CLASS(kcompactd_wake_template, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx), TP_STRUCT__entry( __field(int, nid) __field(int, order) __field(enum zone_type, highest_zoneidx) ), TP_fast_assign( __entry->nid = nid; __entry->order = order; __entry->highest_zoneidx = highest_zoneidx; ), /* * classzone_idx is previous name of the highest_zoneidx. * Reason not to change it is the ABI requirement of the tracepoint. */ TP_printk("nid=%d order=%d classzone_idx=%-8s", __entry->nid, __entry->order, __print_symbolic(__entry->highest_zoneidx, ZONE_TYPE)) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_wakeup_kcompactd, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); DEFINE_EVENT(kcompactd_wake_template, mm_compaction_kcompactd_wake, TP_PROTO(int nid, int order, enum zone_type highest_zoneidx), TP_ARGS(nid, order, highest_zoneidx) ); #endif #endif /* _TRACE_COMPACTION_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM udp #if !defined(_TRACE_UDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_UDP_H #include <linux/udp.h> #include <linux/tracepoint.h> TRACE_EVENT(udp_fail_queue_rcv_skb, TP_PROTO(int rc, struct sock *sk), TP_ARGS(rc, sk), TP_STRUCT__entry( __field(int, rc) __field(__u16, lport) ), TP_fast_assign( __entry->rc = rc; __entry->lport = inet_sk(sk)->inet_num; ), TP_printk("rc=%d port=%hu", __entry->rc, __entry->lport) ); #endif /* _TRACE_UDP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 // SPDX-License-Identifier: GPL-2.0 #include <linux/export.h> #include <linux/lockref.h> #if USE_CMPXCHG_LOCKREF /* * Note that the "cmpxchg()" reloads the "old" value for the * failure case. */ #define CMPXCHG_LOOP(CODE, SUCCESS) do { \ int retry = 100; \ struct lockref old; \ BUILD_BUG_ON(sizeof(old) != 8); \ old.lock_count = READ_ONCE(lockref->lock_count); \ while (likely(arch_spin_value_unlocked(old.lock.rlock.raw_lock))) { \ struct lockref new = old, prev = old; \ CODE \ old.lock_count = cmpxchg64_relaxed(&lockref->lock_count, \ old.lock_count, \ new.lock_count); \ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ if (!--retry) \ break; \ cpu_relax(); \ } \ } while (0) #else #define CMPXCHG_LOOP(CODE, SUCCESS) do { } while (0) #endif /** * lockref_get - Increments reference count unconditionally * @lockref: pointer to lockref structure * * This operation is only valid if you already hold a reference * to the object, so you know the count cannot be zero. */ void lockref_get(struct lockref *lockref) { CMPXCHG_LOOP( new.count++; , return; ); spin_lock(&lockref->lock); lockref->count++; spin_unlock(&lockref->lock); } EXPORT_SYMBOL(lockref_get); /** * lockref_get_not_zero - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero */ int lockref_get_not_zero(struct lockref *lockref) { int retval; CMPXCHG_LOOP( new.count++; if (old.count <= 0) return 0; , return 1; ); spin_lock(&lockref->lock); retval = 0; if (lockref->count > 0) { lockref->count++; retval = 1; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_get_not_zero); /** * lockref_put_not_zero - Decrements count unless count <= 1 before decrement * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count would become zero */ int lockref_put_not_zero(struct lockref *lockref) { int retval; CMPXCHG_LOOP( new.count--; if (old.count <= 1) return 0; , return 1; ); spin_lock(&lockref->lock); retval = 0; if (lockref->count > 1) { lockref->count--; retval = 1; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_put_not_zero); /** * lockref_get_or_lock - Increments count unless the count is 0 or dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count was zero * and we got the lock instead. */ int lockref_get_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count++; if (old.count <= 0) break; , return 1; ); spin_lock(&lockref->lock); if (lockref->count <= 0) return 0; lockref->count++; spin_unlock(&lockref->lock); return 1; } EXPORT_SYMBOL(lockref_get_or_lock); /** * lockref_put_return - Decrement reference count if possible * @lockref: pointer to lockref structure * * Decrement the reference count and return the new value. * If the lockref was dead or locked, return an error. */ int lockref_put_return(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 0) return -1; , return new.count; ); return -1; } EXPORT_SYMBOL(lockref_put_return); /** * lockref_put_or_lock - decrements count unless count <= 1 before decrement * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if count <= 1 and lock taken */ int lockref_put_or_lock(struct lockref *lockref) { CMPXCHG_LOOP( new.count--; if (old.count <= 1) break; , return 1; ); spin_lock(&lockref->lock); if (lockref->count <= 1) return 0; lockref->count--; spin_unlock(&lockref->lock); return 1; } EXPORT_SYMBOL(lockref_put_or_lock); /** * lockref_mark_dead - mark lockref dead * @lockref: pointer to lockref structure */ void lockref_mark_dead(struct lockref *lockref) { assert_spin_locked(&lockref->lock); lockref->count = -128; } EXPORT_SYMBOL(lockref_mark_dead); /** * lockref_get_not_dead - Increments count unless the ref is dead * @lockref: pointer to lockref structure * Return: 1 if count updated successfully or 0 if lockref was dead */ int lockref_get_not_dead(struct lockref *lockref) { int retval; CMPXCHG_LOOP( new.count++; if (old.count < 0) return 0; , return 1; ); spin_lock(&lockref->lock); retval = 0; if (lockref->count >= 0) { lockref->count++; retval = 1; } spin_unlock(&lockref->lock); return retval; } EXPORT_SYMBOL(lockref_get_not_dead);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_IP6_ROUTE_H #define _NET_IP6_ROUTE_H struct route_info { __u8 type; __u8 length; __u8 prefix_len; #if defined(__BIG_ENDIAN_BITFIELD) __u8 reserved_h:3, route_pref:2, reserved_l:3; #elif defined(__LITTLE_ENDIAN_BITFIELD) __u8 reserved_l:3, route_pref:2, reserved_h:3; #endif __be32 lifetime; __u8 prefix[]; /* 0,8 or 16 */ }; #include <net/addrconf.h> #include <net/flow.h> #include <net/ip6_fib.h> #include <net/sock.h> #include <net/lwtunnel.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/route.h> #include <net/nexthop.h> #define RT6_LOOKUP_F_IFACE 0x00000001 #define RT6_LOOKUP_F_REACHABLE 0x00000002 #define RT6_LOOKUP_F_HAS_SADDR 0x00000004 #define RT6_LOOKUP_F_SRCPREF_TMP 0x00000008 #define RT6_LOOKUP_F_SRCPREF_PUBLIC 0x00000010 #define RT6_LOOKUP_F_SRCPREF_COA 0x00000020 #define RT6_LOOKUP_F_IGNORE_LINKSTATE 0x00000040 #define RT6_LOOKUP_F_DST_NOREF 0x00000080 /* We do not (yet ?) support IPv6 jumbograms (RFC 2675) * Unlike IPv4, hdr->seg_len doesn't include the IPv6 header */ #define IP6_MAX_MTU (0xFFFF + sizeof(struct ipv6hdr)) /* * rt6_srcprefs2flags() and rt6_flags2srcprefs() translate * between IPV6_ADDR_PREFERENCES socket option values * IPV6_PREFER_SRC_TMP = 0x1 * IPV6_PREFER_SRC_PUBLIC = 0x2 * IPV6_PREFER_SRC_COA = 0x4 * and above RT6_LOOKUP_F_SRCPREF_xxx flags. */ static inline int rt6_srcprefs2flags(unsigned int srcprefs) { /* No need to bitmask because srcprefs have only 3 bits. */ return srcprefs << 3; } static inline unsigned int rt6_flags2srcprefs(int flags) { return (flags >> 3) & 7; } static inline bool rt6_need_strict(const struct in6_addr *daddr) { return ipv6_addr_type(daddr) & (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } /* fib entries using a nexthop object can not be coalesced into * a multipath route */ static inline bool rt6_qualify_for_ecmp(const struct fib6_info *f6i) { /* the RTF_ADDRCONF flag filters out RA's */ return !(f6i->fib6_flags & RTF_ADDRCONF) && !f6i->nh && f6i->fib6_nh->fib_nh_gw_family; } void ip6_route_input(struct sk_buff *skb); struct dst_entry *ip6_route_input_lookup(struct net *net, struct net_device *dev, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct dst_entry *ip6_route_output_flags_noref(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk, struct flowi6 *fl6, int flags); static inline struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk, struct flowi6 *fl6) { return ip6_route_output_flags(net, sk, fl6, 0); } /* Only conditionally release dst if flags indicates * !RT6_LOOKUP_F_DST_NOREF or dst is in uncached_list. */ static inline void ip6_rt_put_flags(struct rt6_info *rt, int flags) { if (!(flags & RT6_LOOKUP_F_DST_NOREF) || !list_empty(&rt->rt6i_uncached)) ip6_rt_put(rt); } struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags); struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int ifindex, struct flowi6 *fl6, const struct sk_buff *skb, int flags); void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); int ipv6_route_ioctl(struct net *net, unsigned int cmd, struct in6_rtmsg *rtmsg); int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); int ip6_ins_rt(struct net *net, struct fib6_info *f6i); int ip6_del_rt(struct net *net, struct fib6_info *f6i, bool skip_notify); void rt6_flush_exceptions(struct fib6_info *f6i); void rt6_age_exceptions(struct fib6_info *f6i, struct fib6_gc_args *gc_args, unsigned long now); static inline int ip6_route_get_saddr(struct net *net, struct fib6_info *f6i, const struct in6_addr *daddr, unsigned int prefs, struct in6_addr *saddr) { int err = 0; if (f6i && f6i->fib6_prefsrc.plen) { *saddr = f6i->fib6_prefsrc.addr; } else { struct net_device *dev = f6i ? fib6_info_nh_dev(f6i) : NULL; err = ipv6_dev_get_saddr(net, dev, daddr, prefs, saddr); } return err; } struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr, int oif, const struct sk_buff *skb, int flags); u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6, const struct sk_buff *skb, struct flow_keys *hkeys); struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct flowi6 *fl6); void fib6_force_start_gc(struct net *net); struct fib6_info *addrconf_f6i_alloc(struct net *net, struct inet6_dev *idev, const struct in6_addr *addr, bool anycast, gfp_t gfp_flags); struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev, int flags); /* * support functions for ND * */ struct fib6_info *rt6_get_dflt_router(struct net *net, const struct in6_addr *addr, struct net_device *dev); struct fib6_info *rt6_add_dflt_router(struct net *net, const struct in6_addr *gwaddr, struct net_device *dev, unsigned int pref); void rt6_purge_dflt_routers(struct net *net); int rt6_route_rcv(struct net_device *dev, u8 *opt, int len, const struct in6_addr *gwaddr); void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu, int oif, u32 mark, kuid_t uid); void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu); void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark, kuid_t uid); void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif); void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk); struct netlink_callback; struct rt6_rtnl_dump_arg { struct sk_buff *skb; struct netlink_callback *cb; struct net *net; struct fib_dump_filter filter; }; int rt6_dump_route(struct fib6_info *f6i, void *p_arg, unsigned int skip); void rt6_mtu_change(struct net_device *dev, unsigned int mtu); void rt6_remove_prefsrc(struct inet6_ifaddr *ifp); void rt6_clean_tohost(struct net *net, struct in6_addr *gateway); void rt6_sync_up(struct net_device *dev, unsigned char nh_flags); void rt6_disable_ip(struct net_device *dev, unsigned long event); void rt6_sync_down_dev(struct net_device *dev, unsigned long event); void rt6_multipath_rebalance(struct fib6_info *f6i); void rt6_uncached_list_add(struct rt6_info *rt); void rt6_uncached_list_del(struct rt6_info *rt); static inline const struct rt6_info *skb_rt6_info(const struct sk_buff *skb) { const struct dst_entry *dst = skb_dst(skb); const struct rt6_info *rt6 = NULL; if (dst) rt6 = container_of(dst, struct rt6_info, dst); return rt6; } /* * Store a destination cache entry in a socket */ static inline void ip6_dst_store(struct sock *sk, struct dst_entry *dst, const struct in6_addr *daddr, const struct in6_addr *saddr) { struct ipv6_pinfo *np = inet6_sk(sk); np->dst_cookie = rt6_get_cookie((struct rt6_info *)dst); sk_setup_caps(sk, dst); np->daddr_cache = daddr; #ifdef CONFIG_IPV6_SUBTREES np->saddr_cache = saddr; #endif } void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst, const struct flowi6 *fl6); static inline bool ipv6_unicast_destination(const struct sk_buff *skb) { struct rt6_info *rt = (struct rt6_info *) skb_dst(skb); return rt->rt6i_flags & RTF_LOCAL; } static inline bool ipv6_anycast_destination(const struct dst_entry *dst, const struct in6_addr *daddr) { struct rt6_info *rt = (struct rt6_info *)dst; return rt->rt6i_flags & RTF_ANYCAST || (rt->rt6i_dst.plen < 127 && !(rt->rt6i_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) && ipv6_addr_equal(&rt->rt6i_dst.addr, daddr)); } int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, int (*output)(struct net *, struct sock *, struct sk_buff *)); static inline unsigned int ip6_skb_dst_mtu(struct sk_buff *skb) { unsigned int mtu; struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? inet6_sk(skb->sk) : NULL; if (np && np->pmtudisc >= IPV6_PMTUDISC_PROBE) { mtu = READ_ONCE(skb_dst(skb)->dev->mtu); mtu -= lwtunnel_headroom(skb_dst(skb)->lwtstate, mtu); } else mtu = dst_mtu(skb_dst(skb)); return mtu; } static inline bool ip6_sk_accept_pmtu(const struct sock *sk) { return inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_INTERFACE && inet6_sk(sk)->pmtudisc != IPV6_PMTUDISC_OMIT; } static inline bool ip6_sk_ignore_df(const struct sock *sk) { return inet6_sk(sk)->pmtudisc < IPV6_PMTUDISC_DO || inet6_sk(sk)->pmtudisc == IPV6_PMTUDISC_OMIT; } static inline const struct in6_addr *rt6_nexthop(const struct rt6_info *rt, const struct in6_addr *daddr) { if (rt->rt6i_flags & RTF_GATEWAY) return &rt->rt6i_gateway; else if (unlikely(rt->rt6i_flags & RTF_CACHE)) return &rt->rt6i_dst.addr; else return daddr; } static inline bool rt6_duplicate_nexthop(struct fib6_info *a, struct fib6_info *b) { struct fib6_nh *nha, *nhb; if (a->nh || b->nh) return nexthop_cmp(a->nh, b->nh); nha = a->fib6_nh; nhb = b->fib6_nh; return nha->fib_nh_dev == nhb->fib_nh_dev && ipv6_addr_equal(&nha->fib_nh_gw6, &nhb->fib_nh_gw6) && !lwtunnel_cmp_encap(nha->fib_nh_lws, nhb->fib_nh_lws); } static inline unsigned int ip6_dst_mtu_forward(const struct dst_entry *dst) { struct inet6_dev *idev; unsigned int mtu; if (dst_metric_locked(dst, RTAX_MTU)) { mtu = dst_metric_raw(dst, RTAX_MTU); if (mtu) goto out; } mtu = IPV6_MIN_MTU; rcu_read_lock(); idev = __in6_dev_get(dst->dev); if (idev) mtu = idev->cnf.mtu6; rcu_read_unlock(); out: return mtu - lwtunnel_headroom(dst->lwtstate, mtu); } u32 ip6_mtu_from_fib6(const struct fib6_result *res, const struct in6_addr *daddr, const struct in6_addr *saddr); struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw, struct net_device *dev, struct sk_buff *skb, const void *daddr); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * ALSA sequencer Memory Manager * Copyright (c) 1998 by Frank van de Pol <fvdpol@coil.demon.nl> */ #ifndef __SND_SEQ_MEMORYMGR_H #define __SND_SEQ_MEMORYMGR_H #include <sound/seq_kernel.h> #include <linux/poll.h> struct snd_info_buffer; /* container for sequencer event (internal use) */ struct snd_seq_event_cell { struct snd_seq_event event; struct snd_seq_pool *pool; /* used pool */ struct snd_seq_event_cell *next; /* next cell */ }; /* design note: the pool is a contiguous block of memory, if we dynamicly want to add additional cells to the pool be better store this in another pool as we need to know the base address of the pool when releasing memory. */ struct snd_seq_pool { struct snd_seq_event_cell *ptr; /* pointer to first event chunk */ struct snd_seq_event_cell *free; /* pointer to the head of the free list */ int total_elements; /* pool size actually allocated */ atomic_t counter; /* cells free */ int size; /* pool size to be allocated */ int room; /* watermark for sleep/wakeup */ int closing; /* statistics */ int max_used; int event_alloc_nopool; int event_alloc_failures; int event_alloc_success; /* Write locking */ wait_queue_head_t output_sleep; /* Pool lock */ spinlock_t lock; }; void snd_seq_cell_free(struct snd_seq_event_cell *cell); int snd_seq_event_dup(struct snd_seq_pool *pool, struct snd_seq_event *event, struct snd_seq_event_cell **cellp, int nonblock, struct file *file, struct mutex *mutexp); /* return number of unused (free) cells */ static inline int snd_seq_unused_cells(struct snd_seq_pool *pool) { return pool ? pool->total_elements - atomic_read(&pool->counter) : 0; } /* return total number of allocated cells */ static inline int snd_seq_total_cells(struct snd_seq_pool *pool) { return pool ? pool->total_elements : 0; } /* init pool - allocate events */ int snd_seq_pool_init(struct snd_seq_pool *pool); /* done pool - free events */ void snd_seq_pool_mark_closing(struct snd_seq_pool *pool); int snd_seq_pool_done(struct snd_seq_pool *pool); /* create pool */ struct snd_seq_pool *snd_seq_pool_new(int poolsize); /* remove pool */ int snd_seq_pool_delete(struct snd_seq_pool **pool); /* polling */ int snd_seq_pool_poll_wait(struct snd_seq_pool *pool, struct file *file, poll_table *wait); void snd_seq_info_pool(struct snd_info_buffer *buffer, struct snd_seq_pool *pool, char *space); #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_HOST_H #define _SCSI_SCSI_HOST_H #include <linux/device.h> #include <linux/list.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/blk-mq.h> #include <scsi/scsi.h> struct block_device; struct completion; struct module; struct scsi_cmnd; struct scsi_device; struct scsi_host_cmd_pool; struct scsi_target; struct Scsi_Host; struct scsi_host_cmd_pool; struct scsi_transport_template; #define SG_ALL SG_CHUNK_SIZE #define MODE_UNKNOWN 0x00 #define MODE_INITIATOR 0x01 #define MODE_TARGET 0x02 struct scsi_host_template { struct module *module; const char *name; /* * The info function will return whatever useful information the * developer sees fit. If not provided, then the name field will * be used instead. * * Status: OPTIONAL */ const char *(* info)(struct Scsi_Host *); /* * Ioctl interface * * Status: OPTIONAL */ int (*ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #ifdef CONFIG_COMPAT /* * Compat handler. Handle 32bit ABI. * When unknown ioctl is passed return -ENOIOCTLCMD. * * Status: OPTIONAL */ int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #endif int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); /* * The queuecommand function is used to queue up a scsi * command block to the LLDD. When the driver finished * processing the command the done callback is invoked. * * If queuecommand returns 0, then the driver has accepted the * command. It must also push it to the HBA if the scsi_cmnd * flag SCMD_LAST is set, or if the driver does not implement * commit_rqs. The done() function must be called on the command * when the driver has finished with it. (you may call done on the * command before queuecommand returns, but in this case you * *must* return 0 from queuecommand). * * Queuecommand may also reject the command, in which case it may * not touch the command and must not call done() for it. * * There are two possible rejection returns: * * SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but * allow commands to other devices serviced by this host. * * SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this * host temporarily. * * For compatibility, any other non-zero return is treated the * same as SCSI_MLQUEUE_HOST_BUSY. * * NOTE: "temporarily" means either until the next command for# * this device/host completes, or a period of time determined by * I/O pressure in the system if there are no other outstanding * commands. * * STATUS: REQUIRED */ int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *); /* * The commit_rqs function is used to trigger a hardware * doorbell after some requests have been queued with * queuecommand, when an error is encountered before sending * the request with SCMD_LAST set. * * STATUS: OPTIONAL */ void (*commit_rqs)(struct Scsi_Host *, u16); /* * This is an error handling strategy routine. You don't need to * define one of these if you don't want to - there is a default * routine that is present that should work in most cases. For those * driver authors that have the inclination and ability to write their * own strategy routine, this is where it is specified. Note - the * strategy routine is *ALWAYS* run in the context of the kernel eh * thread. Thus you are guaranteed to *NOT* be in an interrupt * handler when you execute this, and you are also guaranteed to * *NOT* have any other commands being queued while you are in the * strategy routine. When you return from this function, operations * return to normal. * * See scsi_error.c scsi_unjam_host for additional comments about * what this function should and should not be attempting to do. * * Status: REQUIRED (at least one of them) */ int (* eh_abort_handler)(struct scsi_cmnd *); int (* eh_device_reset_handler)(struct scsi_cmnd *); int (* eh_target_reset_handler)(struct scsi_cmnd *); int (* eh_bus_reset_handler)(struct scsi_cmnd *); int (* eh_host_reset_handler)(struct scsi_cmnd *); /* * Before the mid layer attempts to scan for a new device where none * currently exists, it will call this entry in your driver. Should * your driver need to allocate any structs or perform any other init * items in order to send commands to a currently unused target/lun * combo, then this is where you can perform those allocations. This * is specifically so that drivers won't have to perform any kind of * "is this a new device" checks in their queuecommand routine, * thereby making the hot path a bit quicker. * * Return values: 0 on success, non-0 on failure * * Deallocation: If we didn't find any devices at this ID, you will * get an immediate call to slave_destroy(). If we find something * here then you will get a call to slave_configure(), then the * device will be used for however long it is kept around, then when * the device is removed from the system (or * possibly at reboot * time), you will then get a call to slave_destroy(). This is * assuming you implement slave_configure and slave_destroy. * However, if you allocate memory and hang it off the device struct, * then you must implement the slave_destroy() routine at a minimum * in order to avoid leaking memory * each time a device is tore down. * * Status: OPTIONAL */ int (* slave_alloc)(struct scsi_device *); /* * Once the device has responded to an INQUIRY and we know the * device is online, we call into the low level driver with the * struct scsi_device *. If the low level device driver implements * this function, it *must* perform the task of setting the queue * depth on the device. All other tasks are optional and depend * on what the driver supports and various implementation details. * * Things currently recommended to be handled at this time include: * * 1. Setting the device queue depth. Proper setting of this is * described in the comments for scsi_change_queue_depth. * 2. Determining if the device supports the various synchronous * negotiation protocols. The device struct will already have * responded to INQUIRY and the results of the standard items * will have been shoved into the various device flag bits, eg. * device->sdtr will be true if the device supports SDTR messages. * 3. Allocating command structs that the device will need. * 4. Setting the default timeout on this device (if needed). * 5. Anything else the low level driver might want to do on a device * specific setup basis... * 6. Return 0 on success, non-0 on error. The device will be marked * as offline on error so that no access will occur. If you return * non-0, your slave_destroy routine will never get called for this * device, so don't leave any loose memory hanging around, clean * up after yourself before returning non-0 * * Status: OPTIONAL */ int (* slave_configure)(struct scsi_device *); /* * Immediately prior to deallocating the device and after all activity * has ceased the mid layer calls this point so that the low level * driver may completely detach itself from the scsi device and vice * versa. The low level driver is responsible for freeing any memory * it allocated in the slave_alloc or slave_configure calls. * * Status: OPTIONAL */ void (* slave_destroy)(struct scsi_device *); /* * Before the mid layer attempts to scan for a new device attached * to a target where no target currently exists, it will call this * entry in your driver. Should your driver need to allocate any * structs or perform any other init items in order to send commands * to a currently unused target, then this is where you can perform * those allocations. * * Return values: 0 on success, non-0 on failure * * Status: OPTIONAL */ int (* target_alloc)(struct scsi_target *); /* * Immediately prior to deallocating the target structure, and * after all activity to attached scsi devices has ceased, the * midlayer calls this point so that the driver may deallocate * and terminate any references to the target. * * Status: OPTIONAL */ void (* target_destroy)(struct scsi_target *); /* * If a host has the ability to discover targets on its own instead * of scanning the entire bus, it can fill in this function and * call scsi_scan_host(). This function will be called periodically * until it returns 1 with the scsi_host and the elapsed time of * the scan in jiffies. * * Status: OPTIONAL */ int (* scan_finished)(struct Scsi_Host *, unsigned long); /* * If the host wants to be called before the scan starts, but * after the midlayer has set up ready for the scan, it can fill * in this function. * * Status: OPTIONAL */ void (* scan_start)(struct Scsi_Host *); /* * Fill in this function to allow the queue depth of this host * to be changeable (on a per device basis). Returns either * the current queue depth setting (may be different from what * was passed in) or an error. An error should only be * returned if the requested depth is legal but the driver was * unable to set it. If the requested depth is illegal, the * driver should set and return the closest legal queue depth. * * Status: OPTIONAL */ int (* change_queue_depth)(struct scsi_device *, int); /* * This functions lets the driver expose the queue mapping * to the block layer. * * Status: OPTIONAL */ int (* map_queues)(struct Scsi_Host *shost); /* * Check if scatterlists need to be padded for DMA draining. * * Status: OPTIONAL */ bool (* dma_need_drain)(struct request *rq); /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by * the host adapter. Parameters: * size, device, list (heads, sectors, cylinders) * * Status: OPTIONAL */ int (* bios_param)(struct scsi_device *, struct block_device *, sector_t, int []); /* * This function is called when one or more partitions on the * device reach beyond the end of the device. * * Status: OPTIONAL */ void (*unlock_native_capacity)(struct scsi_device *); /* * Can be used to export driver statistics and other infos to the * world outside the kernel ie. userspace and it also provides an * interface to feed the driver with information. * * Status: OBSOLETE */ int (*show_info)(struct seq_file *, struct Scsi_Host *); int (*write_info)(struct Scsi_Host *, char *, int); /* * This is an optional routine that allows the transport to become * involved when a scsi io timer fires. The return value tells the * timer routine how to finish the io timeout handling. * * Status: OPTIONAL */ enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *); /* This is an optional routine that allows transport to initiate * LLD adapter or firmware reset using sysfs attribute. * * Return values: 0 on success, -ve value on failure. * * Status: OPTIONAL */ int (*host_reset)(struct Scsi_Host *shost, int reset_type); #define SCSI_ADAPTER_RESET 1 #define SCSI_FIRMWARE_RESET 2 /* * Name of proc directory */ const char *proc_name; /* * Used to store the procfs directory if a driver implements the * show_info method. */ struct proc_dir_entry *proc_dir; /* * This determines if we will use a non-interrupt driven * or an interrupt driven scheme. It is set to the maximum number * of simultaneous commands a single hw queue in HBA will accept. */ int can_queue; /* * In many instances, especially where disconnect / reconnect are * supported, our host also has an ID on the SCSI bus. If this is * the case, then it must be reserved. Please set this_id to -1 if * your setup is in single initiator mode, and the host lacks an * ID. */ int this_id; /* * This determines the degree to which the host adapter is capable * of scatter-gather. */ unsigned short sg_tablesize; unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. */ unsigned int max_sectors; /* * Maximum size in bytes of a single segment. */ unsigned int max_segment_size; /* * DMA scatter gather segment boundary limit. A segment crossing this * boundary will be split in two. */ unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * This specifies "machine infinity" for host templates which don't * limit the transfer size. Note this limit represents an absolute * maximum, and may be over the transfer limits allowed for * individual devices (e.g. 256 for SCSI-1). */ #define SCSI_DEFAULT_MAX_SECTORS 1024 /* * True if this host adapter can make good use of linked commands. * This will allow more than one command to be queued to a given * unit on a given host. Set this to the maximum number of command * blocks to be provided for each device. Set this to 1 for one * command block per lun, 2 for two, etc. Do not set this to 0. * You should make sure that the host adapter will do the right thing * before you try setting this above 1. */ short cmd_per_lun; /* * present contains counter indicating how many boards of this * type were found when we did the scan. */ unsigned char present; /* If use block layer to manage tags, this is tag allocation policy */ int tag_alloc_policy; /* * Track QUEUE_FULL events and reduce queue depth on demand. */ unsigned track_queue_depth:1; /* * This specifies the mode that a LLD supports. */ unsigned supported_mode:2; /* * True if this host adapter uses unchecked DMA onto an ISA bus. */ unsigned unchecked_isa_dma:1; /* * True for emulated SCSI host adapters (e.g. ATAPI). */ unsigned emulated:1; /* * True if the low-level driver performs its own reset-settle delays. */ unsigned skip_settle_delay:1; /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* * Countdown for host blocking with no commands outstanding. */ unsigned int max_host_blocked; /* * Default value for the blocking. If the queue is empty, * host_blocked counts down in the request_fn until it restarts * host operations as zero is reached. * * FIXME: This should probably be a value in the template */ #define SCSI_DEFAULT_HOST_BLOCKED 7 /* * Pointer to the sysfs class properties for this host, NULL terminated. */ struct device_attribute **shost_attrs; /* * Pointer to the SCSI device properties for this host, NULL terminated. */ struct device_attribute **sdev_attrs; /* * Pointer to the SCSI device attribute groups for this host, * NULL terminated. */ const struct attribute_group **sdev_groups; /* * Vendor Identifier associated with the host * * Note: When specifying vendor_id, be sure to read the * Vendor Type and ID formatting requirements specified in * scsi_netlink.h */ u64 vendor_id; /* * Additional per-command data allocated for the driver. */ unsigned int cmd_size; struct scsi_host_cmd_pool *cmd_pool; /* Delay for runtime autosuspend */ int rpm_autosuspend_delay; }; /* * Temporary #define for host lock push down. Can be removed when all * drivers have been updated to take advantage of unlocked * queuecommand. * */ #define DEF_SCSI_QCMD(func_name) \ int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd) \ { \ unsigned long irq_flags; \ int rc; \ spin_lock_irqsave(shost->host_lock, irq_flags); \ rc = func_name##_lck (cmd, cmd->scsi_done); \ spin_unlock_irqrestore(shost->host_lock, irq_flags); \ return rc; \ } /* * shost state: If you alter this, you also need to alter scsi_sysfs.c * (for the ascii descriptions) and the state model enforcer: * scsi_host_set_state() */ enum scsi_host_state { SHOST_CREATED = 1, SHOST_RUNNING, SHOST_CANCEL, SHOST_DEL, SHOST_RECOVERY, SHOST_CANCEL_RECOVERY, SHOST_DEL_RECOVERY, }; struct Scsi_Host { /* * __devices is protected by the host_lock, but you should * usually use scsi_device_lookup / shost_for_each_device * to access it and don't care about locking yourself. * In the rare case of being in irq context you can use * their __ prefixed variants with the lock held. NEVER * access this list directly from a driver. */ struct list_head __devices; struct list_head __targets; struct list_head starved_list; spinlock_t default_lock; spinlock_t *host_lock; struct mutex scan_mutex;/* serialize scanning activity */ struct list_head eh_cmd_q; struct task_struct * ehandler; /* Error recovery thread. */ struct completion * eh_action; /* Wait for specific actions on the host. */ wait_queue_head_t host_wait; struct scsi_host_template *hostt; struct scsi_transport_template *transportt; /* Area to keep a shared tag map */ struct blk_mq_tag_set tag_set; atomic_t host_blocked; unsigned int host_failed; /* commands that failed. protected by host_lock */ unsigned int host_eh_scheduled; /* EH scheduled without command */ unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */ /* next two fields are used to bound the time spent in error handling */ int eh_deadline; unsigned long last_reset; /* * These three parameters can be used to allow for wide scsi, * and for host adapters that support multiple busses * The last two should be set to 1 more than the actual max id * or lun (e.g. 8 for SCSI parallel systems). */ unsigned int max_channel; unsigned int max_id; u64 max_lun; /* * This is a unique identifier that must be assigned so that we * have some way of identifying each detected host adapter properly * and uniquely. For hosts that do not support more than one card * in the system at one time, this does not need to be set. It is * initialized to 0 in scsi_register. */ unsigned int unique_id; /* * The maximum length of SCSI commands that this host can accept. * Probably 12 for most host adapters, but could be 16 for others. * or 260 if the driver supports variable length cdbs. * For drivers that don't set this field, a value of 12 is * assumed. */ unsigned short max_cmd_len; int this_id; int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; short unsigned int sg_prot_tablesize; unsigned int max_sectors; unsigned int max_segment_size; unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * In scsi-mq mode, the number of hardware queues supported by the LLD. * * Note: it is assumed that each hardware queue has a queue depth of * can_queue. In other words, the total queue depth per host * is nr_hw_queues * can_queue. However, for when host_tagset is set, * the total queue depth is can_queue. */ unsigned nr_hw_queues; unsigned active_mode:2; unsigned unchecked_isa_dma:1; /* * Host has requested that no further requests come through for the * time being. */ unsigned host_self_blocked:1; /* * Host uses correct SCSI ordering not PC ordering. The bit is * set for the minority of drivers whose authors actually read * the spec ;). */ unsigned reverse_ordering:1; /* Task mgmt function in progress */ unsigned tmf_in_progress:1; /* Asynchronous scan in progress */ unsigned async_scan:1; /* Don't resume host in EH */ unsigned eh_noresume:1; /* The controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* Host responded with short (<36 bytes) INQUIRY result */ unsigned short_inquiry:1; /* The transport requires the LUN bits NOT to be stored in CDB[1] */ unsigned no_scsi2_lun_in_cdb:1; /* * Optional work queue to be utilized by the transport */ char work_q_name[20]; struct workqueue_struct *work_q; /* * Task management function work queue */ struct workqueue_struct *tmf_work_q; /* * Value host_blocked counts down from */ unsigned int max_host_blocked; /* Protection Information */ unsigned int prot_capabilities; unsigned char prot_guard_type; /* legacy crap */ unsigned long base; unsigned long io_port; unsigned char n_io_port; unsigned char dma_channel; unsigned int irq; enum scsi_host_state shost_state; /* ldm bits */ struct device shost_gendev, shost_dev; /* * Points to the transport data (if any) which is allocated * separately */ void *shost_data; /* * Points to the physical bus device we'd use to do DMA * Needed just in case we have virtual hosts. */ struct device *dma_dev; /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. */ unsigned long hostdata[] /* Used for storage of host specific stuff */ __attribute__ ((aligned (sizeof(unsigned long)))); }; #define class_to_shost(d) \ container_of(d, struct Scsi_Host, shost_dev) #define shost_printk(prefix, shost, fmt, a...) \ dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a) static inline void *shost_priv(struct Scsi_Host *shost) { return (void *)shost->hostdata; } int scsi_is_host_device(const struct device *); static inline struct Scsi_Host *dev_to_shost(struct device *dev) { while (!scsi_is_host_device(dev)) { if (!dev->parent) return NULL; dev = dev->parent; } return container_of(dev, struct Scsi_Host, shost_gendev); } static inline int scsi_host_in_recovery(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RECOVERY || shost->shost_state == SHOST_CANCEL_RECOVERY || shost->shost_state == SHOST_DEL_RECOVERY || shost->tmf_in_progress; } extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); extern void scsi_flush_work(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int); extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, struct device *, struct device *); extern void scsi_scan_host(struct Scsi_Host *); extern void scsi_rescan_device(struct device *); extern void scsi_remove_host(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *); extern int scsi_host_busy(struct Scsi_Host *shost); extern void scsi_host_put(struct Scsi_Host *t); extern struct Scsi_Host *scsi_host_lookup(unsigned short); extern const char *scsi_host_state_name(enum scsi_host_state); extern void scsi_host_complete_all_commands(struct Scsi_Host *shost, int status); static inline int __must_check scsi_add_host(struct Scsi_Host *host, struct device *dev) { return scsi_add_host_with_dma(host, dev, dev); } static inline struct device *scsi_get_device(struct Scsi_Host *shost) { return shost->shost_gendev.parent; } /** * scsi_host_scan_allowed - Is scanning of this host allowed * @shost: Pointer to Scsi_Host. **/ static inline int scsi_host_scan_allowed(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RUNNING || shost->shost_state == SHOST_RECOVERY; } extern void scsi_unblock_requests(struct Scsi_Host *); extern void scsi_block_requests(struct Scsi_Host *); extern int scsi_host_block(struct Scsi_Host *shost); extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state); void scsi_host_busy_iter(struct Scsi_Host *, bool (*fn)(struct scsi_cmnd *, void *, bool), void *priv); struct class_container; /* * These two functions are used to allocate and free a pseudo device * which will connect to the host adapter itself rather than any * physical device. You must deallocate when you are done with the * thing. This physical pseudo-device isn't real and won't be available * from any high-level drivers. */ extern void scsi_free_host_dev(struct scsi_device *); extern struct scsi_device *scsi_get_host_dev(struct Scsi_Host *); /* * DIF defines the exchange of protection information between * initiator and SBC block device. * * DIX defines the exchange of protection information between OS and * initiator. */ enum scsi_host_prot_capabilities { SHOST_DIF_TYPE1_PROTECTION = 1 << 0, /* T10 DIF Type 1 */ SHOST_DIF_TYPE2_PROTECTION = 1 << 1, /* T10 DIF Type 2 */ SHOST_DIF_TYPE3_PROTECTION = 1 << 2, /* T10 DIF Type 3 */ SHOST_DIX_TYPE0_PROTECTION = 1 << 3, /* DIX between OS and HBA only */ SHOST_DIX_TYPE1_PROTECTION = 1 << 4, /* DIX with DIF Type 1 */ SHOST_DIX_TYPE2_PROTECTION = 1 << 5, /* DIX with DIF Type 2 */ SHOST_DIX_TYPE3_PROTECTION = 1 << 6, /* DIX with DIF Type 3 */ }; /* * SCSI hosts which support the Data Integrity Extensions must * indicate their capabilities by setting the prot_capabilities using * this call. */ static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask) { shost->prot_capabilities = mask; } static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) { return shost->prot_capabilities; } static inline int scsi_host_prot_dma(struct Scsi_Host *shost) { return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; } static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) { static unsigned char cap[] = { 0, SHOST_DIF_TYPE1_PROTECTION, SHOST_DIF_TYPE2_PROTECTION, SHOST_DIF_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type] ? target_type : 0; } static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type) { #if defined(CONFIG_BLK_DEV_INTEGRITY) static unsigned char cap[] = { SHOST_DIX_TYPE0_PROTECTION, SHOST_DIX_TYPE1_PROTECTION, SHOST_DIX_TYPE2_PROTECTION, SHOST_DIX_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type]; #endif return 0; } /* * All DIX-capable initiators must support the T10-mandated CRC * checksum. Controllers can optionally implement the IP checksum * scheme which has much lower impact on system performance. Note * that the main rationale for the checksum is to match integrity * metadata with data. Detecting bit errors are a job for ECC memory * and buses. */ enum scsi_host_guard_type { SHOST_DIX_GUARD_CRC = 1 << 0, SHOST_DIX_GUARD_IP = 1 << 1, }; static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type) { shost->prot_guard_type = type; } static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost) { return shost->prot_guard_type; } extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state); #endif /* _SCSI_SCSI_HOST_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 // SPDX-License-Identifier: GPL-2.0 /* * fs/ext4/mballoc.h * * Written by: Alex Tomas <alex@clusterfs.com> * */ #ifndef _EXT4_MBALLOC_H #define _EXT4_MBALLOC_H #include <linux/time.h> #include <linux/fs.h> #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/module.h> #include <linux/swap.h> #include <linux/proc_fs.h> #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4_jbd2.h" #include "ext4.h" /* * mb_debug() dynamic printk msgs could be used to debug mballoc code. */ #ifdef CONFIG_EXT4_DEBUG #define mb_debug(sb, fmt, ...) \ pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ current->comm, task_pid_nr(current), sb->s_id, \ __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else #define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ #define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ /* * How long mballoc can look for a best extent (in found extents) */ #define MB_DEFAULT_MAX_TO_SCAN 200 /* * How long mballoc must look for a best extent */ #define MB_DEFAULT_MIN_TO_SCAN 10 /* * with 'ext4_mb_stats' allocator will collect stats that will be * shown at umount. The collecting costs though! */ #define MB_DEFAULT_STATS 0 /* * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served * by the stream allocator, which purpose is to pack requests * as close each to other as possible to produce smooth I/O traffic * We use locality group prealloc space for stream request. * We can tune the same via /proc/fs/ext4/<parition>/stream_req */ #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ /* * for which requests use 2^N search using buddies */ #define MB_DEFAULT_ORDER2_REQS 2 /* * default group prealloc size 512 blocks */ #define MB_DEFAULT_GROUP_PREALLOC 512 /* * maximum length of inode prealloc list */ #define MB_DEFAULT_MAX_INODE_PREALLOC 512 struct ext4_free_data { /* this links the free block information from sb_info */ struct list_head efd_list; /* this links the free block information from group_info */ struct rb_node efd_node; /* group which free block extent belongs */ ext4_group_t efd_group; /* free block extent */ ext4_grpblk_t efd_start_cluster; ext4_grpblk_t efd_count; /* transaction which freed this extent */ tid_t efd_tid; }; struct ext4_prealloc_space { struct list_head pa_inode_list; struct list_head pa_group_list; union { struct list_head pa_tmp_list; struct rcu_head pa_rcu; } u; spinlock_t pa_lock; atomic_t pa_count; unsigned pa_deleted; ext4_fsblk_t pa_pstart; /* phys. block */ ext4_lblk_t pa_lstart; /* log. block */ ext4_grpblk_t pa_len; /* len of preallocated chunk */ ext4_grpblk_t pa_free; /* how many blocks are free */ unsigned short pa_type; /* pa type. inode or group */ spinlock_t *pa_obj_lock; struct inode *pa_inode; /* hack, for history only */ }; enum { MB_INODE_PA = 0, MB_GROUP_PA = 1 }; struct ext4_free_extent { ext4_lblk_t fe_logical; ext4_grpblk_t fe_start; /* In cluster units */ ext4_group_t fe_group; ext4_grpblk_t fe_len; /* In cluster units */ }; /* * Locality group: * we try to group all related changes together * so that writeback can flush/allocate them together as well * Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC * (512). We store prealloc space into the hash based on the pa_free blocks * order value.ie, fls(pa_free)-1; */ #define PREALLOC_TB_SIZE 10 struct ext4_locality_group { /* for allocator */ /* to serialize allocates */ struct mutex lg_mutex; /* list of preallocations */ struct list_head lg_prealloc_list[PREALLOC_TB_SIZE]; spinlock_t lg_prealloc_lock; }; struct ext4_allocation_context { struct inode *ac_inode; struct super_block *ac_sb; /* original request */ struct ext4_free_extent ac_o_ex; /* goal request (normalized ac_o_ex) */ struct ext4_free_extent ac_g_ex; /* the best found extent */ struct ext4_free_extent ac_b_ex; /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; __u16 ac_groups_scanned; __u16 ac_found; __u16 ac_tail; __u16 ac_buddy; __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; __u8 ac_2order; /* if request is to allocate 2^N blocks and * N > 0, the field stores N, otherwise 0 */ __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; #define AC_STATUS_CONTINUE 1 #define AC_STATUS_FOUND 2 #define AC_STATUS_BREAK 3 struct ext4_buddy { struct page *bd_buddy_page; void *bd_buddy; struct page *bd_bitmap_page; void *bd_bitmap; struct ext4_group_info *bd_info; struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; }; static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { return ext4_group_first_block_no(sb, fex->fe_group) + (fex->fe_start << EXT4_SB(sb)->s_cluster_bits); } typedef int (*ext4_mballoc_query_range_fn)( struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t len, void *priv); int ext4_mballoc_query_range( struct super_block *sb, ext4_group_t agno, ext4_grpblk_t start, ext4_grpblk_t end, ext4_mballoc_query_range_fn formatter, void *priv); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Generic associative array implementation. * * See Documentation/core-api/assoc_array.rst for information. * * Copyright (C) 2013 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #ifndef _LINUX_ASSOC_ARRAY_H #define _LINUX_ASSOC_ARRAY_H #ifdef CONFIG_ASSOCIATIVE_ARRAY #include <linux/types.h> #define ASSOC_ARRAY_KEY_CHUNK_SIZE BITS_PER_LONG /* Key data retrieved in chunks of this size */ /* * Generic associative array. */ struct assoc_array { struct assoc_array_ptr *root; /* The node at the root of the tree */ unsigned long nr_leaves_on_tree; }; /* * Operations on objects and index keys for use by array manipulation routines. */ struct assoc_array_ops { /* Method to get a chunk of an index key from caller-supplied data */ unsigned long (*get_key_chunk)(const void *index_key, int level); /* Method to get a piece of an object's index key */ unsigned long (*get_object_key_chunk)(const void *object, int level); /* Is this the object we're looking for? */ bool (*compare_object)(const void *object, const void *index_key); /* How different is an object from an index key, to a bit position in * their keys? (or -1 if they're the same) */ int (*diff_objects)(const void *object, const void *index_key); /* Method to free an object. */ void (*free_object)(void *object); }; /* * Access and manipulation functions. */ struct assoc_array_edit; static inline void assoc_array_init(struct assoc_array *array) { array->root = NULL; array->nr_leaves_on_tree = 0; } extern int assoc_array_iterate(const struct assoc_array *array, int (*iterator)(const void *object, void *iterator_data), void *iterator_data); extern void *assoc_array_find(const struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key); extern void assoc_array_destroy(struct assoc_array *array, const struct assoc_array_ops *ops); extern struct assoc_array_edit *assoc_array_insert(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key, void *object); extern void assoc_array_insert_set_object(struct assoc_array_edit *edit, void *object); extern struct assoc_array_edit *assoc_array_delete(struct assoc_array *array, const struct assoc_array_ops *ops, const void *index_key); extern struct assoc_array_edit *assoc_array_clear(struct assoc_array *array, const struct assoc_array_ops *ops); extern void assoc_array_apply_edit(struct assoc_array_edit *edit); extern void assoc_array_cancel_edit(struct assoc_array_edit *edit); extern int assoc_array_gc(struct assoc_array *array, const struct assoc_array_ops *ops, bool (*iterator)(void *object, void *iterator_data), void *iterator_data); #endif /* CONFIG_ASSOCIATIVE_ARRAY */ #endif /* _LINUX_ASSOC_ARRAY_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_NOTIFY_FSNOTIFY_H_ #define __FS_NOTIFY_FSNOTIFY_H_ #include <linux/list.h> #include <linux/fsnotify.h> #include <linux/srcu.h> #include <linux/types.h> #include "../mount.h" static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct inode, i_fsnotify_marks); } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct super_block, s_fsnotify_marks); } /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); /* Destroy all marks attached to an object via connector */ extern void fsnotify_destroy_marks(fsnotify_connp_t *connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); /* allocate and destroy and event holder to attach events to notification/access queues */ extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_STRINGHASH_H #define __LINUX_STRINGHASH_H #include <linux/compiler.h> /* For __pure */ #include <linux/types.h> /* For u32, u64 */ #include <linux/hash.h> /* * Routines for hashing strings of bytes to a 32-bit hash value. * * These hash functions are NOT GUARANTEED STABLE between kernel * versions, architectures, or even repeated boots of the same kernel. * (E.g. they may depend on boot-time hardware detection or be * deliberately randomized.) * * They are also not intended to be secure against collisions caused by * malicious inputs; much slower hash functions are required for that. * * They are optimized for pathname components, meaning short strings. * Even if a majority of files have longer names, the dynamic profile of * pathname components skews short due to short directory names. * (E.g. /usr/lib/libsesquipedalianism.so.3.141.) */ /* * Version 1: one byte at a time. Example of use: * * unsigned long hash = init_name_hash; * while (*p) * hash = partial_name_hash(tolower(*p++), hash); * hash = end_name_hash(hash); * * Although this is designed for bytes, fs/hfsplus/unicode.c * abuses it to hash 16-bit values. */ /* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ #define init_name_hash(salt) (unsigned long)(salt) /* partial hash update function. Assume roughly 4 bits per character */ static inline unsigned long partial_name_hash(unsigned long c, unsigned long prevhash) { return (prevhash + (c << 4) + (c >> 4)) * 11; } /* * Finally: cut down the number of bits to a int value (and try to avoid * losing bits). This also has the property (wanted by the dcache) * that the msbits make a good hash table index. */ static inline unsigned int end_name_hash(unsigned long hash) { return hash_long(hash, 32); } /* * Version 2: One word (32 or 64 bits) at a time. * If CONFIG_DCACHE_WORD_ACCESS is defined (meaning <asm/word-at-a-time.h> * exists, which describes major Linux platforms like x86 and ARM), then * this computes a different hash function much faster. * * If not set, this falls back to a wrapper around the preceding. */ extern unsigned int __pure full_name_hash(const void *salt, const char *, unsigned int); /* * A hash_len is a u64 with the hash of a string in the low * half and the length in the high half. */ #define hashlen_hash(hashlen) ((u32)(hashlen)) #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) #define hashlen_create(hash, len) ((u64)(len)<<32 | (u32)(hash)) /* Return the "hash_len" (hash and length) of a null-terminated string */ extern u64 __pure hashlen_string(const void *salt, const char *name); #endif /* __LINUX_STRINGHASH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __CFG80211_RDEV_OPS #define __CFG80211_RDEV_OPS #include <linux/rtnetlink.h> #include <net/cfg80211.h> #include "core.h" #include "trace.h" static inline int rdev_suspend(struct cfg80211_registered_device *rdev, struct cfg80211_wowlan *wowlan) { int ret; trace_rdev_suspend(&rdev->wiphy, wowlan); ret = rdev->ops->suspend(&rdev->wiphy, wowlan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_resume(struct cfg80211_registered_device *rdev) { int ret; trace_rdev_resume(&rdev->wiphy); ret = rdev->ops->resume(&rdev->wiphy); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_set_wakeup(struct cfg80211_registered_device *rdev, bool enabled) { trace_rdev_set_wakeup(&rdev->wiphy, enabled); rdev->ops->set_wakeup(&rdev->wiphy, enabled); trace_rdev_return_void(&rdev->wiphy); } static inline struct wireless_dev *rdev_add_virtual_intf(struct cfg80211_registered_device *rdev, char *name, unsigned char name_assign_type, enum nl80211_iftype type, struct vif_params *params) { struct wireless_dev *ret; trace_rdev_add_virtual_intf(&rdev->wiphy, name, type); ret = rdev->ops->add_virtual_intf(&rdev->wiphy, name, name_assign_type, type, params); trace_rdev_return_wdev(&rdev->wiphy, ret); return ret; } static inline int rdev_del_virtual_intf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_del_virtual_intf(&rdev->wiphy, wdev); ret = rdev->ops->del_virtual_intf(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_virtual_intf(struct cfg80211_registered_device *rdev, struct net_device *dev, enum nl80211_iftype type, struct vif_params *params) { int ret; trace_rdev_change_virtual_intf(&rdev->wiphy, dev, type); ret = rdev->ops->change_virtual_intf(&rdev->wiphy, dev, type, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr, struct key_params *params) { int ret; trace_rdev_add_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr, params->mode); ret = rdev->ops->add_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr, void *cookie, void (*callback)(void *cookie, struct key_params*)) { int ret; trace_rdev_get_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); ret = rdev->ops->get_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr, cookie, callback); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr) { int ret; trace_rdev_del_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); ret = rdev->ops->del_key(&rdev->wiphy, netdev, key_index, pairwise, mac_addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, u8 key_index, bool unicast, bool multicast) { int ret; trace_rdev_set_default_key(&rdev->wiphy, netdev, key_index, unicast, multicast); ret = rdev->ops->set_default_key(&rdev->wiphy, netdev, key_index, unicast, multicast); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_mgmt_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, u8 key_index) { int ret; trace_rdev_set_default_mgmt_key(&rdev->wiphy, netdev, key_index); ret = rdev->ops->set_default_mgmt_key(&rdev->wiphy, netdev, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_default_beacon_key(struct cfg80211_registered_device *rdev, struct net_device *netdev, u8 key_index) { int ret; trace_rdev_set_default_beacon_key(&rdev->wiphy, netdev, key_index); ret = rdev->ops->set_default_beacon_key(&rdev->wiphy, netdev, key_index); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_ap(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ap_settings *settings) { int ret; trace_rdev_start_ap(&rdev->wiphy, dev, settings); ret = rdev->ops->start_ap(&rdev->wiphy, dev, settings); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_beacon(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_beacon_data *info) { int ret; trace_rdev_change_beacon(&rdev->wiphy, dev, info); ret = rdev->ops->change_beacon(&rdev->wiphy, dev, info); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_stop_ap(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_stop_ap(&rdev->wiphy, dev); ret = rdev->ops->stop_ap(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_add_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->add_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_station(struct cfg80211_registered_device *rdev, struct net_device *dev, struct station_del_parameters *params) { int ret; trace_rdev_del_station(&rdev->wiphy, dev, params); ret = rdev->ops->del_station(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_station(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *mac, struct station_parameters *params) { int ret; trace_rdev_change_station(&rdev->wiphy, dev, mac, params); ret = rdev->ops->change_station(&rdev->wiphy, dev, mac, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_station(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_get_station(&rdev->wiphy, dev, mac); ret = rdev->ops->get_station(&rdev->wiphy, dev, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_dump_station(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *mac, struct station_info *sinfo) { int ret; trace_rdev_dump_station(&rdev->wiphy, dev, idx, mac); ret = rdev->ops->dump_station(&rdev->wiphy, dev, idx, mac, sinfo); trace_rdev_return_int_station_info(&rdev->wiphy, ret, sinfo); return ret; } static inline int rdev_add_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_add_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst) { int ret; trace_rdev_del_mpath(&rdev->wiphy, dev, dst); ret = rdev->ops->del_mpath(&rdev->wiphy, dev, dst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop) { int ret; trace_rdev_change_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpath(&rdev->wiphy, dev, dst, next_hop); ret = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_get_mpp(&rdev->wiphy, dev, dst, mpp); ret = rdev->ops->get_mpp(&rdev->wiphy, dev, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpath(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *next_hop, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop); ret = rdev->ops->dump_mpath(&rdev->wiphy, dev, idx, dst, next_hop, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_dump_mpp(struct cfg80211_registered_device *rdev, struct net_device *dev, int idx, u8 *dst, u8 *mpp, struct mpath_info *pinfo) { int ret; trace_rdev_dump_mpp(&rdev->wiphy, dev, idx, dst, mpp); ret = rdev->ops->dump_mpp(&rdev->wiphy, dev, idx, dst, mpp, pinfo); trace_rdev_return_int_mpath_info(&rdev->wiphy, ret, pinfo); return ret; } static inline int rdev_get_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct mesh_config *conf) { int ret; trace_rdev_get_mesh_config(&rdev->wiphy, dev); ret = rdev->ops->get_mesh_config(&rdev->wiphy, dev, conf); trace_rdev_return_int_mesh_config(&rdev->wiphy, ret, conf); return ret; } static inline int rdev_update_mesh_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 mask, const struct mesh_config *nconf) { int ret; trace_rdev_update_mesh_config(&rdev->wiphy, dev, mask, nconf); ret = rdev->ops->update_mesh_config(&rdev->wiphy, dev, mask, nconf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev, const struct mesh_config *conf, const struct mesh_setup *setup) { int ret; trace_rdev_join_mesh(&rdev->wiphy, dev, conf, setup); ret = rdev->ops->join_mesh(&rdev->wiphy, dev, conf, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_mesh(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_mesh(&rdev->wiphy, dev); ret = rdev->ops->leave_mesh(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ocb_setup *setup) { int ret; trace_rdev_join_ocb(&rdev->wiphy, dev, setup); ret = rdev->ops->join_ocb(&rdev->wiphy, dev, setup); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ocb(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ocb(&rdev->wiphy, dev); ret = rdev->ops->leave_ocb(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_change_bss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct bss_parameters *params) { int ret; trace_rdev_change_bss(&rdev->wiphy, dev, params); ret = rdev->ops->change_bss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_txq_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_txq_params *params) { int ret; trace_rdev_set_txq_params(&rdev->wiphy, dev, params); ret = rdev->ops->set_txq_params(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_libertas_set_mesh_channel(struct cfg80211_registered_device *rdev, struct net_device *dev, struct ieee80211_channel *chan) { int ret; trace_rdev_libertas_set_mesh_channel(&rdev->wiphy, dev, chan); ret = rdev->ops->libertas_set_mesh_channel(&rdev->wiphy, dev, chan); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_monitor_channel(struct cfg80211_registered_device *rdev, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_monitor_channel(&rdev->wiphy, chandef); ret = rdev->ops->set_monitor_channel(&rdev->wiphy, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_scan(struct cfg80211_registered_device *rdev, struct cfg80211_scan_request *request) { int ret; trace_rdev_scan(&rdev->wiphy, request); ret = rdev->ops->scan(&rdev->wiphy, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_abort_scan(&rdev->wiphy, wdev); rdev->ops->abort_scan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_auth_request *req) { int ret; trace_rdev_auth(&rdev->wiphy, dev, req); ret = rdev->ops->auth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_assoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_assoc_request *req) { int ret; trace_rdev_assoc(&rdev->wiphy, dev, req); ret = rdev->ops->assoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_deauth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_deauth_request *req) { int ret; trace_rdev_deauth(&rdev->wiphy, dev, req); ret = rdev->ops->deauth(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disassoc(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_disassoc_request *req) { int ret; trace_rdev_disassoc(&rdev->wiphy, dev, req); ret = rdev->ops->disassoc(&rdev->wiphy, dev, req); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_connect(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme) { int ret; trace_rdev_connect(&rdev->wiphy, dev, sme); ret = rdev->ops->connect(&rdev->wiphy, dev, sme); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_connect_params(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_connect_params *sme, u32 changed) { int ret; trace_rdev_update_connect_params(&rdev->wiphy, dev, sme, changed); ret = rdev->ops->update_connect_params(&rdev->wiphy, dev, sme, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_disconnect(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 reason_code) { int ret; trace_rdev_disconnect(&rdev->wiphy, dev, reason_code); ret = rdev->ops->disconnect(&rdev->wiphy, dev, reason_code); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_join_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ibss_params *params) { int ret; trace_rdev_join_ibss(&rdev->wiphy, dev, params); ret = rdev->ops->join_ibss(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_leave_ibss(struct cfg80211_registered_device *rdev, struct net_device *dev) { int ret; trace_rdev_leave_ibss(&rdev->wiphy, dev); ret = rdev->ops->leave_ibss(&rdev->wiphy, dev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_wiphy_params(struct cfg80211_registered_device *rdev, u32 changed) { int ret; if (!rdev->ops->set_wiphy_params) return -EOPNOTSUPP; trace_rdev_set_wiphy_params(&rdev->wiphy, changed); ret = rdev->ops->set_wiphy_params(&rdev->wiphy, changed); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_tx_power_setting type, int mbm) { int ret; trace_rdev_set_tx_power(&rdev->wiphy, wdev, type, mbm); ret = rdev->ops->set_tx_power(&rdev->wiphy, wdev, type, mbm); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_tx_power(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, int *dbm) { int ret; trace_rdev_get_tx_power(&rdev->wiphy, wdev); ret = rdev->ops->get_tx_power(&rdev->wiphy, wdev, dbm); trace_rdev_return_int_int(&rdev->wiphy, ret, *dbm); return ret; } static inline int rdev_set_wds_peer(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr) { int ret; trace_rdev_set_wds_peer(&rdev->wiphy, dev, addr); ret = rdev->ops->set_wds_peer(&rdev->wiphy, dev, addr); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_multicast_to_unicast(struct cfg80211_registered_device *rdev, struct net_device *dev, const bool enabled) { int ret; trace_rdev_set_multicast_to_unicast(&rdev->wiphy, dev, enabled); ret = rdev->ops->set_multicast_to_unicast(&rdev->wiphy, dev, enabled); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_txq_stats(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_txq_stats *txqstats) { int ret; trace_rdev_get_txq_stats(&rdev->wiphy, wdev); ret = rdev->ops->get_txq_stats(&rdev->wiphy, wdev, txqstats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_rfkill_poll(struct cfg80211_registered_device *rdev) { trace_rdev_rfkill_poll(&rdev->wiphy); rdev->ops->rfkill_poll(&rdev->wiphy); trace_rdev_return_void(&rdev->wiphy); } #ifdef CONFIG_NL80211_TESTMODE static inline int rdev_testmode_cmd(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, void *data, int len) { int ret; trace_rdev_testmode_cmd(&rdev->wiphy, wdev); ret = rdev->ops->testmode_cmd(&rdev->wiphy, wdev, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_testmode_dump(struct cfg80211_registered_device *rdev, struct sk_buff *skb, struct netlink_callback *cb, void *data, int len) { int ret; trace_rdev_testmode_dump(&rdev->wiphy); ret = rdev->ops->testmode_dump(&rdev->wiphy, skb, cb, data, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } #endif static inline int rdev_set_bitrate_mask(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, const struct cfg80211_bitrate_mask *mask) { int ret; trace_rdev_set_bitrate_mask(&rdev->wiphy, dev, peer, mask); ret = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, peer, mask); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_dump_survey(struct cfg80211_registered_device *rdev, struct net_device *netdev, int idx, struct survey_info *info) { int ret; trace_rdev_dump_survey(&rdev->wiphy, netdev, idx); ret = rdev->ops->dump_survey(&rdev->wiphy, netdev, idx, info); if (ret < 0) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_survey_info(&rdev->wiphy, ret, info); return ret; } static inline int rdev_set_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_set_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->set_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_pmksa *pmksa) { int ret; trace_rdev_del_pmksa(&rdev->wiphy, netdev, pmksa); ret = rdev->ops->del_pmksa(&rdev->wiphy, netdev, pmksa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_flush_pmksa(struct cfg80211_registered_device *rdev, struct net_device *netdev) { int ret; trace_rdev_flush_pmksa(&rdev->wiphy, netdev); ret = rdev->ops->flush_pmksa(&rdev->wiphy, netdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct ieee80211_channel *chan, unsigned int duration, u64 *cookie) { int ret; trace_rdev_remain_on_channel(&rdev->wiphy, wdev, chan, duration); ret = rdev->ops->remain_on_channel(&rdev->wiphy, wdev, chan, duration, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_cancel_remain_on_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); ret = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_mgmt_tx_params *params, u64 *cookie) { int ret; trace_rdev_mgmt_tx(&rdev->wiphy, wdev, params); ret = rdev->ops->mgmt_tx(&rdev->wiphy, wdev, params, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_tx_control_port(struct cfg80211_registered_device *rdev, struct net_device *dev, const void *buf, size_t len, const u8 *dest, __be16 proto, const bool noencrypt, u64 *cookie) { int ret; trace_rdev_tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt); ret = rdev->ops->tx_control_port(&rdev->wiphy, dev, buf, len, dest, proto, noencrypt, cookie); if (cookie) trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); else trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_mgmt_tx_cancel_wait(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { int ret; trace_rdev_mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); ret = rdev->ops->mgmt_tx_cancel_wait(&rdev->wiphy, wdev, cookie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_power_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, bool enabled, int timeout) { int ret; trace_rdev_set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); ret = rdev->ops->set_power_mgmt(&rdev->wiphy, dev, enabled, timeout); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 rssi_thold, u32 rssi_hyst) { int ret; trace_rdev_set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); ret = rdev->ops->set_cqm_rssi_config(&rdev->wiphy, dev, rssi_thold, rssi_hyst); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_rssi_range_config(struct cfg80211_registered_device *rdev, struct net_device *dev, s32 low, s32 high) { int ret; trace_rdev_set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); ret = rdev->ops->set_cqm_rssi_range_config(&rdev->wiphy, dev, low, high); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_cqm_txe_config(struct cfg80211_registered_device *rdev, struct net_device *dev, u32 rate, u32 pkts, u32 intvl) { int ret; trace_rdev_set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); ret = rdev->ops->set_cqm_txe_config(&rdev->wiphy, dev, rate, pkts, intvl); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_update_mgmt_frame_registrations(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct mgmt_frame_regs *upd) { might_sleep(); trace_rdev_update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); if (rdev->ops->update_mgmt_frame_registrations) rdev->ops->update_mgmt_frame_registrations(&rdev->wiphy, wdev, upd); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_antenna(struct cfg80211_registered_device *rdev, u32 tx_ant, u32 rx_ant) { int ret; trace_rdev_set_antenna(&rdev->wiphy, tx_ant, rx_ant); ret = rdev->ops->set_antenna(&rdev->wiphy, tx_ant, rx_ant); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_antenna(struct cfg80211_registered_device *rdev, u32 *tx_ant, u32 *rx_ant) { int ret; trace_rdev_get_antenna(&rdev->wiphy); ret = rdev->ops->get_antenna(&rdev->wiphy, tx_ant, rx_ant); if (ret) trace_rdev_return_int(&rdev->wiphy, ret); else trace_rdev_return_int_tx_rx(&rdev->wiphy, ret, *tx_ant, *rx_ant); return ret; } static inline int rdev_sched_scan_start(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_sched_scan_request *request) { int ret; trace_rdev_sched_scan_start(&rdev->wiphy, dev, request->reqid); ret = rdev->ops->sched_scan_start(&rdev->wiphy, dev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_sched_scan_stop(struct cfg80211_registered_device *rdev, struct net_device *dev, u64 reqid) { int ret; trace_rdev_sched_scan_stop(&rdev->wiphy, dev, reqid); ret = rdev->ops->sched_scan_stop(&rdev->wiphy, dev, reqid); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_rekey_data(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_gtk_rekey_data *data) { int ret; trace_rdev_set_rekey_data(&rdev->wiphy, dev); ret = rdev->ops->set_rekey_data(&rdev->wiphy, dev, data); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_mgmt(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, u8 action_code, u8 dialog_token, u16 status_code, u32 peer_capability, bool initiator, const u8 *buf, size_t len) { int ret; trace_rdev_tdls_mgmt(&rdev->wiphy, dev, peer, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); ret = rdev->ops->tdls_mgmt(&rdev->wiphy, dev, peer, action_code, dialog_token, status_code, peer_capability, initiator, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_oper(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 *peer, enum nl80211_tdls_operation oper) { int ret; trace_rdev_tdls_oper(&rdev->wiphy, dev, peer, oper); ret = rdev->ops->tdls_oper(&rdev->wiphy, dev, peer, oper); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_client(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u64 *cookie) { int ret; trace_rdev_probe_client(&rdev->wiphy, dev, peer); ret = rdev->ops->probe_client(&rdev->wiphy, dev, peer, cookie); trace_rdev_return_int_cookie(&rdev->wiphy, ret, *cookie); return ret; } static inline int rdev_set_noack_map(struct cfg80211_registered_device *rdev, struct net_device *dev, u16 noack_map) { int ret; trace_rdev_set_noack_map(&rdev->wiphy, dev, noack_map); ret = rdev->ops->set_noack_map(&rdev->wiphy, dev, noack_map); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_channel(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_get_channel(&rdev->wiphy, wdev); ret = rdev->ops->get_channel(&rdev->wiphy, wdev, chandef); trace_rdev_return_chandef(&rdev->wiphy, ret, chandef); return ret; } static inline int rdev_start_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { int ret; trace_rdev_start_p2p_device(&rdev->wiphy, wdev); ret = rdev->ops->start_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_p2p_device(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_p2p_device(&rdev->wiphy, wdev); rdev->ops->stop_p2p_device(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf) { int ret; trace_rdev_start_nan(&rdev->wiphy, wdev, conf); ret = rdev->ops->start_nan(&rdev->wiphy, wdev, conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_stop_nan(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_stop_nan(&rdev->wiphy, wdev); rdev->ops->stop_nan(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_add_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_func *nan_func) { int ret; trace_rdev_add_nan_func(&rdev->wiphy, wdev, nan_func); ret = rdev->ops->add_nan_func(&rdev->wiphy, wdev, nan_func); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_del_nan_func(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, u64 cookie) { trace_rdev_del_nan_func(&rdev->wiphy, wdev, cookie); rdev->ops->del_nan_func(&rdev->wiphy, wdev, cookie); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_nan_change_conf(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_nan_conf *conf, u32 changes) { int ret; trace_rdev_nan_change_conf(&rdev->wiphy, wdev, conf, changes); if (rdev->ops->nan_change_conf) ret = rdev->ops->nan_change_conf(&rdev->wiphy, wdev, conf, changes); else ret = -ENOTSUPP; trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_mac_acl(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_acl_data *params) { int ret; trace_rdev_set_mac_acl(&rdev->wiphy, dev, params); ret = rdev->ops->set_mac_acl(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_update_ft_ies(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_ft_ies_params *ftie) { int ret; trace_rdev_update_ft_ies(&rdev->wiphy, dev, ftie); ret = rdev->ops->update_ft_ies(&rdev->wiphy, dev, ftie); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_crit_proto_start(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, enum nl80211_crit_proto_id protocol, u16 duration) { int ret; trace_rdev_crit_proto_start(&rdev->wiphy, wdev, protocol, duration); ret = rdev->ops->crit_proto_start(&rdev->wiphy, wdev, protocol, duration); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_crit_proto_stop(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev) { trace_rdev_crit_proto_stop(&rdev->wiphy, wdev); rdev->ops->crit_proto_stop(&rdev->wiphy, wdev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_csa_settings *params) { int ret; trace_rdev_channel_switch(&rdev->wiphy, dev, params); ret = rdev->ops->channel_switch(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_qos_map(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_qos_map *qos_map) { int ret = -EOPNOTSUPP; if (rdev->ops->set_qos_map) { trace_rdev_set_qos_map(&rdev->wiphy, dev, qos_map); ret = rdev->ops->set_qos_map(&rdev->wiphy, dev, qos_map); trace_rdev_return_int(&rdev->wiphy, ret); } return ret; } static inline int rdev_set_ap_chanwidth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_set_ap_chanwidth(&rdev->wiphy, dev, chandef); ret = rdev->ops->set_ap_chanwidth(&rdev->wiphy, dev, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_add_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer, u8 user_prio, u16 admitted_time) { int ret = -EOPNOTSUPP; trace_rdev_add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); if (rdev->ops->add_tx_ts) ret = rdev->ops->add_tx_ts(&rdev->wiphy, dev, tsid, peer, user_prio, admitted_time); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_tx_ts(struct cfg80211_registered_device *rdev, struct net_device *dev, u8 tsid, const u8 *peer) { int ret = -EOPNOTSUPP; trace_rdev_del_tx_ts(&rdev->wiphy, dev, tsid, peer); if (rdev->ops->del_tx_ts) ret = rdev->ops->del_tx_ts(&rdev->wiphy, dev, tsid, peer); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_tdls_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr, u8 oper_class, struct cfg80211_chan_def *chandef) { int ret; trace_rdev_tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); ret = rdev->ops->tdls_channel_switch(&rdev->wiphy, dev, addr, oper_class, chandef); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *addr) { trace_rdev_tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); rdev->ops->tdls_cancel_channel_switch(&rdev->wiphy, dev, addr); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_start_radar_detection(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_chan_def *chandef, u32 cac_time_ms) { int ret = -ENOTSUPP; trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms); if (rdev->ops->start_radar_detection) ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev, chandef, cac_time_ms); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_end_cac(struct cfg80211_registered_device *rdev, struct net_device *dev) { trace_rdev_end_cac(&rdev->wiphy, dev); if (rdev->ops->end_cac) rdev->ops->end_cac(&rdev->wiphy, dev); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_set_mcast_rate(struct cfg80211_registered_device *rdev, struct net_device *dev, int mcast_rate[NUM_NL80211_BANDS]) { int ret = -ENOTSUPP; trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate); if (rdev->ops->set_mcast_rate) ret = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_coalesce(struct cfg80211_registered_device *rdev, struct cfg80211_coalesce *coalesce) { int ret = -ENOTSUPP; trace_rdev_set_coalesce(&rdev->wiphy, coalesce); if (rdev->ops->set_coalesce) ret = rdev->ops->set_coalesce(&rdev->wiphy, coalesce); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_pmk_conf *pmk_conf) { int ret = -EOPNOTSUPP; trace_rdev_set_pmk(&rdev->wiphy, dev, pmk_conf); if (rdev->ops->set_pmk) ret = rdev->ops->set_pmk(&rdev->wiphy, dev, pmk_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_del_pmk(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *aa) { int ret = -EOPNOTSUPP; trace_rdev_del_pmk(&rdev->wiphy, dev, aa); if (rdev->ops->del_pmk) ret = rdev->ops->del_pmk(&rdev->wiphy, dev, aa); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_external_auth(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_external_auth_params *params) { int ret = -EOPNOTSUPP; trace_rdev_external_auth(&rdev->wiphy, dev, params); if (rdev->ops->external_auth) ret = rdev->ops->external_auth(&rdev->wiphy, dev, params); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_get_ftm_responder_stats(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_ftm_responder_stats *ftm_stats) { int ret = -EOPNOTSUPP; trace_rdev_get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); if (rdev->ops->get_ftm_responder_stats) ret = rdev->ops->get_ftm_responder_stats(&rdev->wiphy, dev, ftm_stats); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_start_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { int ret = -EOPNOTSUPP; trace_rdev_start_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->start_pmsr) ret = rdev->ops->start_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline void rdev_abort_pmsr(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev, struct cfg80211_pmsr_request *request) { trace_rdev_abort_pmsr(&rdev->wiphy, wdev, request->cookie); if (rdev->ops->abort_pmsr) rdev->ops->abort_pmsr(&rdev->wiphy, wdev, request); trace_rdev_return_void(&rdev->wiphy); } static inline int rdev_update_owe_info(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_update_owe_info *oweinfo) { int ret = -EOPNOTSUPP; trace_rdev_update_owe_info(&rdev->wiphy, dev, oweinfo); if (rdev->ops->update_owe_info) ret = rdev->ops->update_owe_info(&rdev->wiphy, dev, oweinfo); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_probe_mesh_link(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *dest, const void *buf, size_t len) { int ret; trace_rdev_probe_mesh_link(&rdev->wiphy, dev, dest, buf, len); ret = rdev->ops->probe_mesh_link(&rdev->wiphy, dev, buf, len); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_set_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, struct cfg80211_tid_config *tid_conf) { int ret; trace_rdev_set_tid_config(&rdev->wiphy, dev, tid_conf); ret = rdev->ops->set_tid_config(&rdev->wiphy, dev, tid_conf); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } static inline int rdev_reset_tid_config(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *peer, u8 tids) { int ret; trace_rdev_reset_tid_config(&rdev->wiphy, dev, peer, tids); ret = rdev->ops->reset_tid_config(&rdev->wiphy, dev, peer, tids); trace_rdev_return_int(&rdev->wiphy, ret); return ret; } #endif /* __CFG80211_RDEV_OPS */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP module. * * Version: @(#)udp.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : Turned on udp checksums. I don't want to * chase 'memory corruption' bugs that aren't! */ #ifndef _UDP_H #define _UDP_H #include <linux/list.h> #include <linux/bug.h> #include <net/inet_sock.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/indirect_call_wrapper.h> /** * struct udp_skb_cb - UDP(-Lite) private variables * * @header: private variables used by IPv4/IPv6 * @cscov: checksum coverage length (UDP-Lite only) * @partial_cov: if set indicates partial csum coverage */ struct udp_skb_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; __u16 cscov; __u8 partial_cov; }; #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** * struct udp_hslot - UDP hash slot * * @head: head of list of sockets * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { struct hlist_head head; int count; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); /** * struct udp_table - UDP table * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; struct udp_hslot *hash2; unsigned int mask; unsigned int log; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); static inline struct udp_hslot *udp_hashslot(struct udp_table *table, struct net *net, unsigned int num) { return &table->hash[udp_hashfn(net, num, table->mask)]; } /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() */ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { return &table->hash2[hash & table->mask]; } extern struct proto udp_prot; extern atomic_long_t udp_memory_allocated; /* sysctl variables for udp */ extern long sysctl_udp_mem[3]; extern int sysctl_udp_rmem_min; extern int sysctl_udp_wmem_min; struct sk_buff; /* * Generic checksumming routines for UDP(-Lite) v4 and v6 */ static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb) { return (UDP_SKB_CB(skb)->cscov == skb->len ? __skb_checksum_complete(skb) : __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov)); } static inline int udp_lib_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __udp_lib_checksum_complete(skb); } /** * udp_csum_outgoing - compute UDPv4/v6 checksum over fragments * @sk: socket we are writing to * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) */ static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), 0); skb_queue_walk(&sk->sk_write_queue, skb) { csum = csum_add(csum, skb->csum); } return csum; } static inline __wsum udp_csum(struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), skb->csum); for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) { csum = csum_add(csum, skb->csum); } return csum; } static inline __sum16 udp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); } void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len); static inline void udp_csum_pull_header(struct sk_buff *skb) { if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE) skb->csum = csum_partial(skb->data, sizeof(struct udphdr), skb->csum); skb_pull_rcsum(skb, sizeof(struct udphdr)); UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); } typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; unsigned int hlen, off; off = skb_gro_offset(skb); hlen = off + sizeof(*uh); uh = skb_gro_header_fast(skb, off); if (skb_gro_header_hard(skb, hlen)) uh = skb_gro_header_slow(skb, hlen, off); return uh; } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { BUG(); return 0; } void udp_lib_unhash(struct sock *sk); void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); u32 udp_flow_hashrnd(void); static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, int min, int max, bool use_eth) { u32 hash; if (min >= max) { /* Use default range */ inet_get_local_port_range(net, &min, &max); } hash = skb_get_hash(skb); if (unlikely(!hash)) { if (use_eth) { /* Can't find a normal hash, caller has indicated an * Ethernet packet so use that to compute a hash. */ hash = jhash(skb->data, 2 * ETH_ALEN, (__force u32) skb->protocol); } else { /* Can't derive any sort of hash for the packet, set * to some consistent random value. */ hash = udp_flow_hashrnd(); } } /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only upper 16 bits are relevant in the * computation for 16 bit port value. */ hash ^= hash << 16; return htons((((u64) hash * (max - min)) >> 32) + min); } static inline int udp_rqueue_get(struct sock *sk) { return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept, bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); #endif } /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *off, int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *err) { int off = 0; return __skb_recv_udp(sk, flags, noblock, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)); struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport); struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif); struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() */ struct udp_dev_scratch { /* skb->truesize and the stateless bit are embedded in a single field; * do not use a bitfield since the compiler emits better/smaller code * this way */ u32 _tsize_state; #if BITS_PER_LONG == 64 /* len and the bit needed to compute skb_csum_unnecessary * will be on cold cache lines at recvmsg time. * skb->len can be stored on 16 bits since the udp header has been * already validated and pulled. */ u16 len; bool is_linear; bool csum_unnecessary; #endif }; static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb) { return (struct udp_dev_scratch *)&skb->dev_scratch; } #if BITS_PER_LONG == 64 static inline unsigned int udp_skb_len(struct sk_buff *skb) { return udp_skb_scratch(skb)->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return udp_skb_scratch(skb)->csum_unnecessary; } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return udp_skb_scratch(skb)->is_linear; } #else static inline unsigned int udp_skb_len(struct sk_buff *skb) { return skb->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return skb_csum_unnecessary(skb); } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return !skb_is_nonlinear(skb); } #endif static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { int n; n = copy_to_iter(skb->data + off, len, to); if (n == len) return 0; iov_iter_revert(to, n); return -EFAULT; } /* * SNMP statistics for UDP and UDP-Lite */ #define UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else __SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP6_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\ else __SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #define UDP6_INC_STATS(net, field, __lite) do { \ if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field); \ else SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #if IS_ENABLED(CONFIG_IPV6) #define __UDPX_MIB(sk, ipv4) \ ({ \ ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics) : \ (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \ sock_net(sk)->mib.udp_stats_in6); \ }) #else #define __UDPX_MIB(sk, ipv4) \ ({ \ IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics; \ }) #endif #define __UDPX_INC_STATS(sk, field) \ __SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field) #ifdef CONFIG_PROC_FS struct udp_seq_afinfo { sa_family_t family; struct udp_table *udp_table; }; struct udp_iter_state { struct seq_net_private p; int bucket; struct udp_seq_afinfo *bpf_seq_afinfo; }; void *udp_seq_start(struct seq_file *seq, loff_t *pos); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void udp_seq_stop(struct seq_file *seq, void *v); extern const struct seq_operations udp_seq_ops; extern const struct seq_operations udp6_seq_ops; int udp4_proc_init(void); void udp4_proc_exit(void); #endif /* CONFIG_PROC_FS */ int udpv4_offload_init(void); void udp_init(void); DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void); #if IS_ENABLED(CONFIG_IPV6) DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void); #endif static inline struct sk_buff *udp_rcv_segment(struct sock *sk, struct sk_buff *skb, bool ipv4) { netdev_features_t features = NETIF_F_SG; struct sk_buff *segs; /* Avoid csum recalculation by skb_segment unless userspace explicitly * asks for the final checksum values */ if (!inet_get_convert_csum(sk)) features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; /* UDP segmentation expects packets of type CHECKSUM_PARTIAL or * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. * Reset in this specific case, where PARTIAL is both correct and * required. */ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; /* the GSO CB lays after the UDP one, no need to save and restore any * CB fragment */ segs = __skb_gso_segment(skb, features, false); if (IS_ERR_OR_NULL(segs)) { int segs_nr = skb_shinfo(skb)->gso_segs; atomic_add(segs_nr, &sk->sk_drops); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr); kfree_skb(skb); return NULL; } consume_skb(skb); return segs; } #ifdef CONFIG_BPF_STREAM_PARSER struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); #endif /* BPF_STREAM_PARSER */ #endif /* _UDP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux NET3: Internet Group Management Protocol [IGMP] * * Authors: * Alan Cox <alan@lxorguk.ukuu.org.uk> * * Extended to talk the BSD extended IGMP protocol of mrouted 3.6 */ #ifndef _LINUX_IGMP_H #define _LINUX_IGMP_H #include <linux/skbuff.h> #include <linux/timer.h> #include <linux/in.h> #include <linux/ip.h> #include <linux/refcount.h> #include <uapi/linux/igmp.h> static inline struct igmphdr *igmp_hdr(const struct sk_buff *skb) { return (struct igmphdr *)skb_transport_header(skb); } static inline struct igmpv3_report * igmpv3_report_hdr(const struct sk_buff *skb) { return (struct igmpv3_report *)skb_transport_header(skb); } static inline struct igmpv3_query * igmpv3_query_hdr(const struct sk_buff *skb) { return (struct igmpv3_query *)skb_transport_header(skb); } struct ip_sf_socklist { unsigned int sl_max; unsigned int sl_count; struct rcu_head rcu; __be32 sl_addr[]; }; #define IP_SFLSIZE(count) (sizeof(struct ip_sf_socklist) + \ (count) * sizeof(__be32)) #define IP_SFBLOCK 10 /* allocate this many at once */ /* ip_mc_socklist is real list now. Speed is not argument; this list never used in fast path code */ struct ip_mc_socklist { struct ip_mc_socklist __rcu *next_rcu; struct ip_mreqn multi; unsigned int sfmode; /* MCAST_{INCLUDE,EXCLUDE} */ struct ip_sf_socklist __rcu *sflist; struct rcu_head rcu; }; struct ip_sf_list { struct ip_sf_list *sf_next; unsigned long sf_count[2]; /* include/exclude counts */ __be32 sf_inaddr; unsigned char sf_gsresp; /* include in g & s response? */ unsigned char sf_oldin; /* change state */ unsigned char sf_crcount; /* retrans. left to send */ }; struct ip_mc_list { struct in_device *interface; __be32 multiaddr; unsigned int sfmode; struct ip_sf_list *sources; struct ip_sf_list *tomb; unsigned long sfcount[2]; union { struct ip_mc_list *next; struct ip_mc_list __rcu *next_rcu; }; struct ip_mc_list __rcu *next_hash; struct timer_list timer; int users; refcount_t refcnt; spinlock_t lock; char tm_running; char reporter; char unsolicit_count; char loaded; unsigned char gsquery; /* check source marks? */ unsigned char crcount; struct rcu_head rcu; }; /* V3 exponential field decoding */ #define IGMPV3_MASK(value, nb) ((nb)>=32 ? (value) : ((1<<(nb))-1) & (value)) #define IGMPV3_EXP(thresh, nbmant, nbexp, value) \ ((value) < (thresh) ? (value) : \ ((IGMPV3_MASK(value, nbmant) | (1<<(nbmant))) << \ (IGMPV3_MASK((value) >> (nbmant), nbexp) + (nbexp)))) #define IGMPV3_QQIC(value) IGMPV3_EXP(0x80, 4, 3, value) #define IGMPV3_MRC(value) IGMPV3_EXP(0x80, 4, 3, value) static inline int ip_mc_may_pull(struct sk_buff *skb, unsigned int len) { if (skb_transport_offset(skb) + ip_transport_len(skb) < len) return 0; return pskb_may_pull(skb, len); } extern int ip_check_mc_rcu(struct in_device *dev, __be32 mc_addr, __be32 src_addr, u8 proto); extern int igmp_rcv(struct sk_buff *); extern int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr); extern int ip_mc_join_group_ssm(struct sock *sk, struct ip_mreqn *imr, unsigned int mode); extern int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr); extern void ip_mc_drop_socket(struct sock *sk); extern int ip_mc_source(int add, int omode, struct sock *sk, struct ip_mreq_source *mreqs, int ifindex); extern int ip_mc_msfilter(struct sock *sk, struct ip_msfilter *msf,int ifindex); extern int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_msfilter __user *optval, int __user *optlen); extern int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage __user *p); extern int ip_mc_sf_allow(struct sock *sk, __be32 local, __be32 rmt, int dif, int sdif); extern void ip_mc_init_dev(struct in_device *); extern void ip_mc_destroy_dev(struct in_device *); extern void ip_mc_up(struct in_device *); extern void ip_mc_down(struct in_device *); extern void ip_mc_unmap(struct in_device *); extern void ip_mc_remap(struct in_device *); extern void __ip_mc_dec_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); static inline void ip_mc_dec_group(struct in_device *in_dev, __be32 addr) { return __ip_mc_dec_group(in_dev, addr, GFP_KERNEL); } extern void __ip_mc_inc_group(struct in_device *in_dev, __be32 addr, gfp_t gfp); extern void ip_mc_inc_group(struct in_device *in_dev, __be32 addr); int ip_mc_check_igmp(struct sk_buff *skb); #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 /* SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. */ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask.h> #include <linux/rcupdate.h> struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); void delayed_work_timer_fn(struct timer_list *t); /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ #else WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ #endif WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif /* * The last color is no color used for works which don't * participate in workqueue flushing. */ WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS) - 1, WORK_NO_COLOR = WORK_NR_COLORS, /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 15 workqueue * flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* data contains off-queue information when !WORK_STRUCT_PWQ */ WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING), /* * When a work item is off queue, its high bits point to the last * pool it was on. Cap at 31 bits and use the highest number to * indicate that no pool is associated. */ WORK_OFFQ_FLAG_BITS = 1, WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, WORK_OFFQ_POOL_NONE = (1LU << WORK_OFFQ_POOL_BITS) - 1, /* convenience constants */ WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, WORK_STRUCT_NO_POOL = (unsigned long)WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 24, }; struct work_struct { atomic_long_t data; struct list_head entry; work_func_t func; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs */ cpumask_var_t cpumask; /** * @no_numa: disable NUMA affinity * * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus * doesn't participate in pool hash calculations or equality comparisons. */ bool no_numa; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. */ #ifdef CONFIG_LOCKDEP #define __INIT_WORK(_work, _func, _onstack) \ do { \ static struct lock_class_key __key; \ \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK(_work, _func, _onstack) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __init_timer(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __init_timer_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum { WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * excute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; /* unbound wq's aren't per-cpu, scale max_active according to #cpus */ #define WQ_UNBOUND_MAX_ACTIVE \ max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU) /* * System-wide workqueues which are always present. * * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * * Allocate a workqueue with the specified parameters. For detailed * information on WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args...: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_workqueue_state(void); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. * * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } /** * schedule_work - put work task in global workqueue * @work: job to be done * * Returns %false if @work was already on the kernel-global workqueue and * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } /** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its * completion. * * Think twice before calling this function! It's very easy to get into * trouble if you don't take great care. Either of the following situations * will lead to deadlock: * * One of the work items currently on the workqueue needs to acquire * a lock held by your code or its caller. * * Your code is running in the context of a work routine. * * They will be detected by lockdep when they occur, but the first might not * occur very often. It depends on what work items are on the workqueue and * what locks they need, which you have no control over. * * In most situations flushing the entire workqueue is overkill; you merely * need to know that a particular work item isn't queued and isn't running. * In such cases you should use cancel_delayed_work_sync() or * cancel_work_sync() instead. */ static inline void flush_scheduled_work(void) { flush_workqueue(system_wq); } /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu(int cpu, long (*fn)(void *), void *arg); long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * acpi_bus.h - ACPI Bus Driver ($Revision: 22 $) * * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> */ #ifndef __ACPI_BUS_H__ #define __ACPI_BUS_H__ #include <linux/device.h> #include <linux/property.h> /* TBD: Make dynamic */ #define ACPI_MAX_HANDLES 10 struct acpi_handle_list { u32 count; acpi_handle handles[ACPI_MAX_HANDLES]; }; /* acpi_utils.h */ acpi_status acpi_extract_package(union acpi_object *package, struct acpi_buffer *format, struct acpi_buffer *buffer); acpi_status acpi_evaluate_integer(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, unsigned long long *data); acpi_status acpi_evaluate_reference(acpi_handle handle, acpi_string pathname, struct acpi_object_list *arguments, struct acpi_handle_list *list); acpi_status acpi_evaluate_ost(acpi_handle handle, u32 source_event, u32 status_code, struct acpi_buffer *status_buf); acpi_status acpi_get_physical_device_location(acpi_handle handle, struct acpi_pld_info **pld); bool acpi_has_method(acpi_handle handle, char *name); acpi_status acpi_execute_simple_method(acpi_handle handle, char *method, u64 arg); acpi_status acpi_evaluate_ej0(acpi_handle handle); acpi_status acpi_evaluate_lck(acpi_handle handle, int lock); acpi_status acpi_evaluate_reg(acpi_handle handle, u8 space_id, u32 function); bool acpi_ata_match(acpi_handle handle); bool acpi_bay_match(acpi_handle handle); bool acpi_dock_match(acpi_handle handle); bool acpi_check_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 funcs); union acpi_object *acpi_evaluate_dsm(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4); static inline union acpi_object * acpi_evaluate_dsm_typed(acpi_handle handle, const guid_t *guid, u64 rev, u64 func, union acpi_object *argv4, acpi_object_type type) { union acpi_object *obj; obj = acpi_evaluate_dsm(handle, guid, rev, func, argv4); if (obj && obj->type != type) { ACPI_FREE(obj); obj = NULL; } return obj; } #define ACPI_INIT_DSM_ARGV4(cnt, eles) \ { \ .package.type = ACPI_TYPE_PACKAGE, \ .package.count = (cnt), \ .package.elements = (eles) \ } bool acpi_dev_found(const char *hid); bool acpi_dev_present(const char *hid, const char *uid, s64 hrv); #ifdef CONFIG_ACPI struct proc_dir_entry; #define ACPI_BUS_FILE_ROOT "acpi" extern struct proc_dir_entry *acpi_root_dir; enum acpi_bus_device_type { ACPI_BUS_TYPE_DEVICE = 0, ACPI_BUS_TYPE_POWER, ACPI_BUS_TYPE_PROCESSOR, ACPI_BUS_TYPE_THERMAL, ACPI_BUS_TYPE_POWER_BUTTON, ACPI_BUS_TYPE_SLEEP_BUTTON, ACPI_BUS_TYPE_ECDT_EC, ACPI_BUS_DEVICE_TYPE_COUNT }; struct acpi_driver; struct acpi_device; /* * ACPI Scan Handler * ----------------- */ struct acpi_hotplug_profile { struct kobject kobj; int (*scan_dependent)(struct acpi_device *adev); void (*notify_online)(struct acpi_device *adev); bool enabled:1; bool demand_offline:1; }; static inline struct acpi_hotplug_profile *to_acpi_hotplug_profile( struct kobject *kobj) { return container_of(kobj, struct acpi_hotplug_profile, kobj); } struct acpi_scan_handler { const struct acpi_device_id *ids; struct list_head list_node; bool (*match)(const char *idstr, const struct acpi_device_id **matchid); int (*attach)(struct acpi_device *dev, const struct acpi_device_id *id); void (*detach)(struct acpi_device *dev); void (*bind)(struct device *phys_dev); void (*unbind)(struct device *phys_dev); struct acpi_hotplug_profile hotplug; }; /* * ACPI Hotplug Context * -------------------- */ struct acpi_hotplug_context { struct acpi_device *self; int (*notify)(struct acpi_device *, u32); void (*uevent)(struct acpi_device *, u32); void (*fixup)(struct acpi_device *); }; /* * ACPI Driver * ----------- */ typedef int (*acpi_op_add) (struct acpi_device * device); typedef int (*acpi_op_remove) (struct acpi_device * device); typedef void (*acpi_op_notify) (struct acpi_device * device, u32 event); struct acpi_device_ops { acpi_op_add add; acpi_op_remove remove; acpi_op_notify notify; }; #define ACPI_DRIVER_ALL_NOTIFY_EVENTS 0x1 /* system AND device events */ struct acpi_driver { char name[80]; char class[80]; const struct acpi_device_id *ids; /* Supported Hardware IDs */ unsigned int flags; struct acpi_device_ops ops; struct device_driver drv; struct module *owner; }; /* * ACPI Device * ----------- */ /* Status (_STA) */ struct acpi_device_status { u32 present:1; u32 enabled:1; u32 show_in_ui:1; u32 functional:1; u32 battery_present:1; u32 reserved:27; }; /* Flags */ struct acpi_device_flags { u32 dynamic_status:1; u32 removable:1; u32 ejectable:1; u32 power_manageable:1; u32 match_driver:1; u32 initialized:1; u32 visited:1; u32 hotplug_notify:1; u32 is_dock_station:1; u32 of_compatible_ok:1; u32 coherent_dma:1; u32 cca_seen:1; u32 enumeration_by_parent:1; u32 reserved:19; }; /* File System */ struct acpi_device_dir { struct proc_dir_entry *entry; }; #define acpi_device_dir(d) ((d)->dir.entry) /* Plug and Play */ typedef char acpi_bus_id[8]; typedef u64 acpi_bus_address; typedef char acpi_device_name[40]; typedef char acpi_device_class[20]; struct acpi_hardware_id { struct list_head list; const char *id; }; struct acpi_pnp_type { u32 hardware_id:1; u32 bus_address:1; u32 platform_id:1; u32 reserved:29; }; struct acpi_device_pnp { acpi_bus_id bus_id; /* Object name */ int instance_no; /* Instance number of this object */ struct acpi_pnp_type type; /* ID type */ acpi_bus_address bus_address; /* _ADR */ char *unique_id; /* _UID */ struct list_head ids; /* _HID and _CIDs */ acpi_device_name device_name; /* Driver-determined */ acpi_device_class device_class; /* " */ union acpi_object *str_obj; /* unicode string for _STR method */ }; #define acpi_device_bid(d) ((d)->pnp.bus_id) #define acpi_device_adr(d) ((d)->pnp.bus_address) const char *acpi_device_hid(struct acpi_device *device); #define acpi_device_uid(d) ((d)->pnp.unique_id) #define acpi_device_name(d) ((d)->pnp.device_name) #define acpi_device_class(d) ((d)->pnp.device_class) /* Power Management */ struct acpi_device_power_flags { u32 explicit_get:1; /* _PSC present? */ u32 power_resources:1; /* Power resources */ u32 inrush_current:1; /* Serialize Dx->D0 */ u32 power_removed:1; /* Optimize Dx->D0 */ u32 ignore_parent:1; /* Power is independent of parent power state */ u32 dsw_present:1; /* _DSW present? */ u32 reserved:26; }; struct acpi_device_power_state { struct { u8 valid:1; u8 explicit_set:1; /* _PSx present? */ u8 reserved:6; } flags; int power; /* % Power (compared to D0) */ int latency; /* Dx->D0 time (microseconds) */ struct list_head resources; /* Power resources referenced */ }; struct acpi_device_power { int state; /* Current state */ struct acpi_device_power_flags flags; struct acpi_device_power_state states[ACPI_D_STATE_COUNT]; /* Power states (D0-D3Cold) */ }; /* Performance Management */ struct acpi_device_perf_flags { u8 reserved:8; }; struct acpi_device_perf_state { struct { u8 valid:1; u8 reserved:7; } flags; u8 power; /* % Power (compared to P0) */ u8 performance; /* % Performance ( " ) */ int latency; /* Px->P0 time (microseconds) */ }; struct acpi_device_perf { int state; struct acpi_device_perf_flags flags; int state_count; struct acpi_device_perf_state *states; }; /* Wakeup Management */ struct acpi_device_wakeup_flags { u8 valid:1; /* Can successfully enable wakeup? */ u8 notifier_present:1; /* Wake-up notify handler has been installed */ }; struct acpi_device_wakeup_context { void (*func)(struct acpi_device_wakeup_context *context); struct device *dev; }; struct acpi_device_wakeup { acpi_handle gpe_device; u64 gpe_number; u64 sleep_state; struct list_head resources; struct acpi_device_wakeup_flags flags; struct acpi_device_wakeup_context context; struct wakeup_source *ws; int prepare_count; int enable_count; }; struct acpi_device_physical_node { unsigned int node_id; struct list_head node; struct device *dev; bool put_online:1; }; struct acpi_device_properties { const guid_t *guid; const union acpi_object *properties; struct list_head list; }; /* ACPI Device Specific Data (_DSD) */ struct acpi_device_data { const union acpi_object *pointer; struct list_head properties; const union acpi_object *of_compatible; struct list_head subnodes; }; struct acpi_gpio_mapping; /* Device */ struct acpi_device { int device_type; acpi_handle handle; /* no handle for fixed hardware */ struct fwnode_handle fwnode; struct acpi_device *parent; struct list_head children; struct list_head node; struct list_head wakeup_list; struct list_head del_list; struct acpi_device_status status; struct acpi_device_flags flags; struct acpi_device_pnp pnp; struct acpi_device_power power; struct acpi_device_wakeup wakeup; struct acpi_device_perf performance; struct acpi_device_dir dir; struct acpi_device_data data; struct acpi_scan_handler *handler; struct acpi_hotplug_context *hp; struct acpi_driver *driver; const struct acpi_gpio_mapping *driver_gpios; void *driver_data; struct device dev; unsigned int physical_node_count; unsigned int dep_unmet; struct list_head physical_node_list; struct mutex physical_node_lock; void (*remove)(struct acpi_device *); }; /* Non-device subnode */ struct acpi_data_node { const char *name; acpi_handle handle; struct fwnode_handle fwnode; struct fwnode_handle *parent; struct acpi_device_data data; struct list_head sibling; struct kobject kobj; struct completion kobj_done; }; extern const struct fwnode_operations acpi_device_fwnode_ops; extern const struct fwnode_operations acpi_data_fwnode_ops; extern const struct fwnode_operations acpi_static_fwnode_ops; bool is_acpi_device_node(const struct fwnode_handle *fwnode); bool is_acpi_data_node(const struct fwnode_handle *fwnode); static inline bool is_acpi_node(const struct fwnode_handle *fwnode) { return (is_acpi_device_node(fwnode) || is_acpi_data_node(fwnode)); } #define to_acpi_device_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_device_node_fwnode = __fwnode; \ \ is_acpi_device_node(__to_acpi_device_node_fwnode) ? \ container_of(__to_acpi_device_node_fwnode, \ struct acpi_device, fwnode) : \ NULL; \ }) #define to_acpi_data_node(__fwnode) \ ({ \ typeof(__fwnode) __to_acpi_data_node_fwnode = __fwnode; \ \ is_acpi_data_node(__to_acpi_data_node_fwnode) ? \ container_of(__to_acpi_data_node_fwnode, \ struct acpi_data_node, fwnode) : \ NULL; \ }) static inline bool is_acpi_static_node(const struct fwnode_handle *fwnode) { return !IS_ERR_OR_NULL(fwnode) && fwnode->ops == &acpi_static_fwnode_ops; } static inline bool acpi_data_node_match(cons