1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/block_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE */ #include <linux/init.h> #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/uaccess.h> #include <linux/suspend.h> #include "internal.h" struct bdev_inode { struct block_device bdev; struct inode vfs_inode; }; static const struct address_space_operations def_blk_aops; static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); } struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } EXPORT_SYMBOL(I_BDEV); static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; int ret; spin_lock(&inode->i_lock); while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); if (ret) { char name[BDEVNAME_SIZE]; pr_warn_ratelimited("VFS: Dirty inode writeback failed " "for block device %s (err=%d).\n", bdevname(bdev, name), ret); } spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); } /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; if (mapping->nrpages) { invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } /* 99% of the time, we don't need to flush the cleancache on the bdev. * But, for the strange corners, lets be cautious */ cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); /* * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). */ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend) { struct block_device *claimed_bdev = NULL; int err; /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & FMODE_EXCL)) { claimed_bdev = bdev->bd_contains; err = bd_prepare_to_claim(bdev, claimed_bdev, truncate_bdev_range); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); if (claimed_bdev) bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); return 0; invalidate: /* * Someone else has handle exclusively open. Try invalidating instead. * The 'end' argument is inclusive so the rounding is safe. */ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT); } EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { unsigned int bsize = bdev_logical_block_size(bdev); loff_t size = i_size_read(bdev->bd_inode); while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; } bdev->bd_inode->i_blkbits = blksize_bits(bsize); } int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ if (size < bdev_logical_block_size(bdev)) return -EINVAL; /* Don't change the size if it is same as current */ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { sync_blockdev(bdev); bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } return 0; } EXPORT_SYMBOL(set_blocksize); int sb_set_blocksize(struct super_block *sb, int size) { if (set_blocksize(sb->s_bdev, size)) return 0; /* If we get here, we know size is power of two * and it's value is between 512 and PAGE_SIZE */ sb->s_blocksize = size; sb->s_blocksize_bits = blksize_bits(size); return sb->s_blocksize; } EXPORT_SYMBOL(sb_set_blocksize); int sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); } EXPORT_SYMBOL(sb_min_blocksize); static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } static struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } static unsigned int dio_bio_write_op(struct kiocb *iocb) { unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ if (iocb->ki_flags & IOCB_DSYNC) op |= REQ_FUA; return op; } #define DIO_INLINE_BIO_VECS 4 static void blkdev_bio_end_io_simple(struct bio *bio) { struct task_struct *waiter = bio->bi_private; WRITE_ONCE(bio->bi_private, NULL); blk_wake_io_task(waiter); } static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) { struct file *file = iocb->ki_filp; struct block_device *bdev = I_BDEV(bdev_file_inode(file)); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; blk_qc_t qc; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); if (!vecs) return -ENOMEM; } bio_init(&bio, vecs, nr_pages); bio_set_dev(&bio, bdev); bio.bi_iter.bi_sector = pos >> 9; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == READ) { bio.bi_opf = REQ_OP_READ; if (iter_is_iovec(iter)) should_dirty = true; } else { bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); qc = submit_bio(&bio); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); out: if (vecs != inline_vecs) kfree(vecs); bio_uninit(&bio); return ret; } struct blkdev_dio { union { struct kiocb *iocb; struct task_struct *waiter; }; size_t size; atomic_t ref; bool multi_bio : 1; bool should_dirty : 1; bool is_sync : 1; struct bio bio; }; static struct bio_set blkdev_dio_pool; static int blkdev_iopoll(struct kiocb *kiocb, bool wait) { struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); struct request_queue *q = bdev_get_queue(bdev); return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); } static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); if (dio->multi_bio) bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } } if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); if (dio->is_sync) { dio->waiter = current; bio_get(bio); } else { dio->iocb = iocb; } dio->size = 0; dio->multi_bio = false; dio->should_dirty = is_read && iter_is_iovec(iter); /* * Don't plug for HIPRI/polled IO, as those should go straight * to issue */ if (!is_poll) blk_start_plug(&plug); for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } if (is_read) { bio->bi_opf = REQ_OP_READ; if (dio->should_dirty) bio_set_pages_dirty(bio); } else { bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { bool polled = false; if (iocb->ki_flags & IOCB_HIPRI) { bio_set_polled(bio, iocb); polled = true; } qc = submit_bio(bio); if (polled) WRITE_ONCE(iocb->ki_cookie, qc); break; } if (!dio->multi_bio) { /* * AIO needs an extra reference to ensure the dio * structure which is embedded into the first bio * stays around. */ if (!is_sync) bio_get(bio); dio->multi_bio = true; atomic_set(&dio->ref, 2); } else { atomic_inc(&dio->ref); } submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } if (!is_poll) blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; bio_put(&dio->bio); return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); if (!nr_pages) return 0; if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); } static __init int blkdev_init(void) { return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); } module_init(blkdev_init); int __sync_blockdev(struct block_device *bdev, int wait) { if (!bdev) return 0; if (!wait) return filemap_flush(bdev->bd_inode->i_mapping); return filemap_write_and_wait(bdev->bd_inode->i_mapping); } /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. */ int sync_blockdev(struct block_device *bdev) { return __sync_blockdev(bdev, 1); } EXPORT_SYMBOL(sync_blockdev); /* * Write out and wait upon all dirty data associated with this * device. Filesystem data as well as the underlying block * device. Takes the superblock lock. */ int fsync_bdev(struct block_device *bdev) { struct super_block *sb = get_super(bdev); if (sb) { int res = sync_filesystem(sb); drop_super(sb); return res; } return sync_blockdev(bdev); } EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple * freeze requests arrive simultaneously. It counts up in freeze_bdev() and * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze * actually. */ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); if (++bdev->bd_fsfreeze_count > 1) { /* * We don't even need to grab a reference - the first call * to freeze_bdev grab an active reference and only the last * thaw_bdev drops it. */ sb = get_super(bdev); if (sb) drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } sb = get_active_super(bdev); if (!sb) goto out; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb); else error = freeze_super(sb); if (error) { deactivate_super(sb); bdev->bd_fsfreeze_count--; mutex_unlock(&bdev->bd_fsfreeze_mutex); return ERR_PTR(error); } deactivate_super(sb); out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; /* thaw_bdev releases s->s_umount */ } EXPORT_SYMBOL(freeze_bdev); /** * thaw_bdev -- unlock filesystem * @bdev: blockdevice to unlock * @sb: associated superblock * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ int thaw_bdev(struct block_device *bdev, struct super_block *sb) { int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); if (!bdev->bd_fsfreeze_count) goto out; error = 0; if (--bdev->bd_fsfreeze_count > 0) goto out; if (!sb) goto out; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb); else error = thaw_super(sb); if (error) bdev->bd_fsfreeze_count++; out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(thaw_bdev); static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); } static int blkdev_readpage(struct file * file, struct page * page) { return block_read_full_page(page, blkdev_get_block); } static void blkdev_readahead(struct readahead_control *rac) { mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { return block_write_begin(mapping, pos, len, flags, pagep, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { int ret; ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); unlock_page(page); put_page(page); return ret; } /* * private llseek: * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = bdev_file_inode(file); loff_t retval; inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); inode_unlock(bd_inode); return retval; } int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; error = file_write_and_wait_range(filp, start, end); if (error) return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ error = blkdev_issue_flush(bdev, GFP_KERNEL); if (error == -EOPNOTSUPP) error = 0; return error; } EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device * @bdev: The device to read the page from * @sector: The offset on the device to read the page to (need not be aligned) * @page: The page to read * * On entry, the page should be locked. It will be unlocked when the page * has been read. If the block driver implements rw_page synchronously, * that will be true on exit from this function, but it need not be. * * Errors returned by this function are usually "soft", eg out of memory, or * queue full; callers should try a different route to read this page rather * than propagate an error back up the stack. * * Return: negative errno if an error occurs, 0 if submission was successful. */ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; int result = -EOPNOTSUPP; if (!ops->rw_page || bdev_get_integrity(bdev)) return result; result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_READ); blk_queue_exit(bdev->bd_disk->queue); return result; } /** * bdev_write_page() - Start writing a page to a block device * @bdev: The device to write the page to * @sector: The offset on the device to write the page to (need not be aligned) * @page: The page to write * @wbc: The writeback_control for the write * * On entry, the page should be locked and not currently under writeback. * On exit, if the write started successfully, the page will be unlocked and * under writeback. If the write failed already (eg the driver failed to * queue the page to the device), the page will still be locked. If the * caller is a ->writepage implementation, it will need to unlock the page. * * Errors returned by this function are usually "soft", eg out of memory, or * queue full; callers should try a different route to write this page rather * than propagate an error back up the stack. * * Return: negative errno if an error occurs, 0 if submission was successful. */ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_WRITE); if (result) { end_page_writeback(page); } else { clean_page_buffers(page); unlock_page(page); } blk_queue_exit(bdev->bd_disk->queue); return result; } /* * pseudo-fs */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) { struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void bdev_free_inode(struct inode *inode) { kmem_cache_free(bdev_cachep, BDEV_I(inode)); } static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; struct block_device *bdev = &ei->bdev; memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); } static void bdev_evict_inode(struct inode *inode) { struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); /* Detach inode from wb early as bdi_put() may free bdi->wb */ inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; } } static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, .drop_inode = generic_delete_inode, .evict_inode = bdev_evict_inode, }; static int bd_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_CGROUPWB; ctx->ops = &bdev_sops; return 0; } static struct file_system_type bd_type = { .name = "bdev", .init_fs_context = bd_init_fs_context, .kill_sb = kill_anon_super, }; struct super_block *blockdev_superblock __read_mostly; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; static struct vfsmount *bd_mnt; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } /* * Most likely _very_ bad one - but then it's hardly critical for small * /dev and can be fixed when somebody will need really large one. * Keep in mind that it will be fed through icache hash function too. */ static inline unsigned long hash(dev_t dev) { return MAJOR(dev)+MINOR(dev); } static int bdev_test(struct inode *inode, void *data) { return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; } static int bdev_set(struct inode *inode, void *data) { BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; return 0; } static struct block_device *bdget(dev_t dev) { struct block_device *bdev; struct inode *inode; inode = iget5_locked(blockdev_superblock, hash(dev), bdev_test, bdev_set, &dev); if (!inode) return NULL; bdev = &BDEV_I(inode)->bdev; if (inode->i_state & I_NEW) { spin_lock_init(&bdev->bd_size_lock); bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; inode->i_mode = S_IFBLK; inode->i_rdev = dev; inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); unlock_new_inode(inode); } return bdev; } /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. */ struct block_device *bdgrab(struct block_device *bdev) { ihold(bdev->bd_inode); return bdev; } EXPORT_SYMBOL(bdgrab); struct block_device *bdget_part(struct hd_struct *part) { return bdget(part_devt(part)); } long nr_blockdev_pages(void) { struct inode *inode; long ret = 0; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) ret += inode->i_mapping->nrpages; spin_unlock(&blockdev_superblock->s_inode_list_lock); return ret; } void bdput(struct block_device *bdev) { iput(bdev->bd_inode); } EXPORT_SYMBOL(bdput); static struct block_device *bd_acquire(struct inode *inode) { struct block_device *bdev; spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev && !inode_unhashed(bdev->bd_inode)) { bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } spin_unlock(&bdev_lock); /* * i_bdev references block device inode that was already shut down * (corresponding device got removed). Remove the reference and look * up block device inode again just in case new device got * reestablished under the same device number. */ if (bdev) bd_forget(inode); bdev = bdget(inode->i_rdev); if (bdev) { spin_lock(&bdev_lock); if (!inode->i_bdev) { /* * We take an additional reference to bd_inode, * and it's released in clear_inode() of inode. * So, we can access it via ->i_mapping always * without igrab(). */ bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; } spin_unlock(&bdev_lock); } return bdev; } /* Call when you free inode */ void bd_forget(struct inode *inode) { struct block_device *bdev = NULL; spin_lock(&bdev_lock); if (!sb_is_blkdev_sb(inode->i_sb)) bdev = inode->i_bdev; inode->i_bdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&bdev_lock); if (bdev) bdput(bdev); } /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest * @whole: whole block device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * * Test whether @bdev can be claimed by @holder. * * CONTEXT: * spin_lock(&bdev_lock). * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, void *holder) { if (bdev->bd_holder == holder) return true; /* already a holder */ else if (bdev->bd_holder != NULL) return false; /* held by someone else */ else if (whole == bdev) return true; /* is a whole device which isn't held */ else if (whole->bd_holder == bd_may_claim) return true; /* is a partition of a device that is being partitioned */ else if (whole->bd_holder != NULL) return false; /* is a partition of a held device */ else return true; /* is a partition of an un-held device */ } /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @whole: the whole device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * * Claim @bdev. This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, void *holder) { retry: spin_lock(&bdev_lock); /* if someone else claimed, fail */ if (!bd_may_claim(bdev, whole, holder)) { spin_unlock(&bdev_lock); return -EBUSY; } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; } /* yay, all mine */ whole->bd_claiming = holder; spin_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) { struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); if (!disk) return NULL; /* * Now that we hold gendisk reference we make sure bdev we looked up is * not stale. If it is, it means device got removed and created before * we looked up gendisk and we fail open in such case. Associating * unhashed bdev with newly created gendisk could lead to two bdevs * (and thus two independent caches) being associated with one device * which is bad. */ if (inode_unhashed(bdev->bd_inode)) { put_disk_and_module(disk); return NULL; } return disk; } static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; wake_up_bit(&whole->bd_claiming, 0); } /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @whole: whole block device * @holder: holder that has claimed @bdev * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ static void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, void *holder) { spin_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, whole, holder)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder */ whole->bd_holders++; whole->bd_holder = bd_may_claim; bdev->bd_holders++; bdev->bd_holder = holder; bd_clear_claiming(whole, holder); spin_unlock(&bdev_lock); } /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest * @whole: whole block device * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be * also used when exclusive open is not actually desired and we just needed * to block other exclusive openers for a while. */ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void *holder) { spin_lock(&bdev_lock); bd_clear_claiming(whole, holder); spin_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list; struct gendisk *disk; int refcnt; }; static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; list_for_each_entry(holder, &bdev->bd_holder_disks, list) if (holder->disk == disk) return holder; return NULL; } static int add_symlink(struct kobject *from, struct kobject *to) { return sysfs_create_link(from, to, kobject_name(to)); } static void del_symlink(struct kobject *from, struct kobject *to) { sysfs_remove_link(from, kobject_name(to)); } /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev * @disk: the holding disk * * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. * * This functions creates the following sysfs symlinks. * * - from "slaves" directory of the holder @disk to the claimed @bdev * - from "holders" directory of the @bdev to the holder @disk * * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is * passed to bd_link_disk_holder(), then: * * /sys/block/dm-0/slaves/sda --> /sys/block/sda * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 * * The caller must have claimed @bdev before calling this function and * ensure that both @bdev and @disk are valid during the creation and * lifetime of these symlinks. * * CONTEXT: * Might sleep. * * RETURNS: * 0 on success, -errno on failure. */ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; int ret = 0; mutex_lock(&bdev->bd_mutex); WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); if (holder) { holder->refcnt++; goto out_unlock; } holder = kzalloc(sizeof(*holder), GFP_KERNEL); if (!holder) { ret = -ENOMEM; goto out_unlock; } INIT_LIST_HEAD(&holder->list); holder->disk = disk; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); if (ret) goto out_free; ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; /* * bdev could be deleted beneath us which would implicitly destroy * the holder directory. Hold on to it. */ kobject_get(bdev->bd_part->holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; out_del: del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); out_free: kfree(holder); out_unlock: mutex_unlock(&bdev->bd_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev * @disk: the holding disk * * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. * * CONTEXT: * Might sleep. */ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; mutex_lock(&bdev->bd_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); kobject_put(bdev->bd_part->holder_dir); list_del_init(&holder->list); kfree(holder); } mutex_unlock(&bdev->bd_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif /** * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. */ static void check_disk_size_change(struct gendisk *disk, struct block_device *bdev, bool verbose) { loff_t disk_size, bdev_size; spin_lock(&bdev->bd_size_lock); disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { if (verbose) { printk(KERN_INFO "%s: detected capacity change from %lld to %lld\n", disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); } spin_unlock(&bdev->bd_size_lock); if (bdev_size > disk_size) { if (__invalidate_device(bdev, false)) pr_warn("VFS: busy inodes on resized disk %s\n", disk->disk_name); } } /** * revalidate_disk_size - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. */ void revalidate_disk_size(struct gendisk *disk, bool verbose) { struct block_device *bdev; /* * Hidden disks don't have associated bdev so there's no point in * revalidating them. */ if (disk->flags & GENHD_FL_HIDDEN) return; bdev = bdget_disk(disk, 0); if (bdev) { check_disk_size_change(disk, bdev, verbose); bdput(bdev); } } EXPORT_SYMBOL(revalidate_disk_size); void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); spin_unlock(&bdev->bd_size_lock); } EXPORT_SYMBOL(bd_set_nr_sectors); static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) { struct gendisk *disk = bdev->bd_disk; int ret; lockdep_assert_held(&bdev->bd_mutex); if (!(disk->flags & GENHD_FL_UP)) return -ENXIO; rescan: ret = blk_drop_partitions(bdev); if (ret) return ret; clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that * support partitions (independ of actually having partitions created). * Doing that is rather inconsistent, but changing it broke legacy * udisks polling for legacy ide-cdrom devices. Use the crude check * below to get the sane behavior for most device while not breaking * userspace for this particular setup. */ if (invalidate) { if (disk_part_scan_enabled(disk) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } else { if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); } check_disk_size_change(disk, bdev, !invalidate); if (get_capacity(disk)) { ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); } return ret; } /* * Only exported for for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); /* * bd_mutex locking: * * mutex_lock(part->bd_mutex) * mutex_lock_nested(whole->bd_mutex, 1) */ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, int for_part) { struct block_device *whole = NULL, *claiming = NULL; struct gendisk *disk; int ret; int partno; bool first_open = false, unblock_events = true, need_restart; restart: need_restart = false; ret = -ENXIO; disk = bdev_get_gendisk(bdev, &partno); if (!disk) goto out; if (partno) { whole = bdget_disk(disk, 0); if (!whole) { ret = -ENOMEM; goto out_put_disk; } } if (!for_part && (mode & FMODE_EXCL)) { WARN_ON_ONCE(!holder); if (whole) claiming = whole; else claiming = bdev; ret = bd_prepare_to_claim(bdev, claiming, holder); if (ret) goto out_put_whole; } disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { first_open = true; bdev->bd_disk = disk; bdev->bd_contains = bdev; bdev->bd_partno = partno; if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); if (!bdev->bd_part) goto out_clear; ret = 0; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); /* * If we lost a race with 'disk' being deleted, * try again. See md.c */ if (ret == -ERESTARTSYS) need_restart = true; } if (!ret) { bd_set_nr_sectors(bdev, get_capacity(disk)); set_init_blocksize(bdev); } /* * If the device is invalidated, rescan partition * if open succeeded or failed with -ENOMEDIUM. * The latter is necessary to prevent ghost * partitions on a removed medium. */ if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) goto out_clear; } else { BUG_ON(for_part); ret = __blkdev_get(whole, mode, NULL, 1); if (ret) goto out_clear; bdev->bd_contains = bdgrab(whole); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { ret = -ENXIO; goto out_clear; } bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } if (bdev->bd_bdi == &noop_backing_dev_info) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (bdev->bd_contains == bdev) { ret = 0; if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) goto out_unlock_bdev; } } bdev->bd_openers++; if (for_part) bdev->bd_part_count++; if (claiming) bd_finish_claiming(bdev, claiming, holder); /* * Block event polling for write claims if requested. Any write holder * makes the write_holder state stick until all are released. This is * good enough and tracking individual writeable reference is too * fragile given the way @mode is used in blkdev_get/put(). */ if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder && (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; unblock_events = false; } mutex_unlock(&bdev->bd_mutex); if (unblock_events) disk_unblock_events(disk); /* only one opener holds refs to the module and disk */ if (!first_open) put_disk_and_module(disk); if (whole) bdput(whole); return 0; out_clear: disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; out_unlock_bdev: if (claiming) bd_abort_claiming(bdev, claiming, holder); mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); out_put_whole: if (whole) bdput(whole); out_put_disk: put_disk_and_module(disk); if (need_restart) goto restart; out: return ret; } /** * blkdev_get - open a block device * @bdev: block_device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is * open with exclusive access. Specifying %FMODE_EXCL with %NULL * @holder is invalid. Exclusive opens may nest for the same @holder. * * On success, the reference count of @bdev is unchanged. On failure, * @bdev is put. * * CONTEXT: * Might sleep. * * RETURNS: * 0 on success, -errno on failure. */ static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { int ret, perm = 0; if (mode & FMODE_READ) perm |= MAY_READ; if (mode & FMODE_WRITE) perm |= MAY_WRITE; ret = devcgroup_inode_permission(bdev->bd_inode, perm); if (ret) goto bdput; ret =__blkdev_get(bdev, mode, holder, 0); if (ret) goto bdput; return 0; bdput: bdput(bdev); return ret; } /** * blkdev_get_by_path - open a block device by name * @path: path to the block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * * Open the blockdevice described by the device file at @path. @mode * and @holder are identical to blkdev_get(). * * On success, the returned block_device has reference count of one. * * CONTEXT: * Might sleep. * * RETURNS: * Pointer to block_device on success, ERR_PTR(-errno) on failure. */ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder) { struct block_device *bdev; int err; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; err = blkdev_get(bdev, mode, holder); if (err) return ERR_PTR(err); if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, mode); return ERR_PTR(-EACCES); } return bdev; } EXPORT_SYMBOL(blkdev_get_by_path); /** * blkdev_get_by_dev - open a block device by device number * @dev: device number of block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * * Open the blockdevice described by device number @dev. @mode and * @holder are identical to blkdev_get(). * * Use it ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a * device number. _Never_ to be used for internal purposes. If you * ever need it - reconsider your API. * * On success, the returned block_device has reference count of one. * * CONTEXT: * Might sleep. * * RETURNS: * Pointer to block_device on success, ERR_PTR(-errno) on failure. */ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) { struct block_device *bdev; int err; bdev = bdget(dev); if (!bdev) return ERR_PTR(-ENOMEM); err = blkdev_get(bdev, mode, holder); if (err) return ERR_PTR(err); return bdev; } EXPORT_SYMBOL(blkdev_get_by_dev); static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; /* * Preserve backwards compatibility and allow large file access * even if userspace doesn't ask for it explicitly. Some mkfs * binary needs it. We might want to drop this workaround * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; if (filp->f_flags & O_EXCL) filp->f_mode |= FMODE_EXCL; if ((filp->f_flags & O_ACCMODE) == 3) filp->f_mode |= FMODE_WRITE_IOCTL; bdev = bd_acquire(inode); if (bdev == NULL) return -ENOMEM; filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return blkdev_get(bdev, filp->f_mode, filp); } static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; /* * Sync early if it looks like we're the last one. If someone else * opens the block device between now and the decrement of bd_openers * then we did a sync that we didn't need to, but that's not the end * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. */ if (bdev->bd_openers == 1) sync_blockdev(bdev); mutex_lock_nested(&bdev->bd_mutex, for_part); if (for_part) bdev->bd_part_count--; if (!--bdev->bd_openers) { WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); } if (bdev->bd_contains == bdev) { if (disk->fops->release) disk->fops->release(disk, mode); } if (!bdev->bd_openers) { disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; put_disk_and_module(disk); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); if (victim) __blkdev_put(victim, mode, 1); } void blkdev_put(struct block_device *bdev, fmode_t mode) { mutex_lock(&bdev->bd_mutex); if (mode & FMODE_EXCL) { bool bdev_free; /* * Release a claim on the device. The holder fields * are protected with bdev_lock. bd_mutex is to * synchronize disk_holder unlinking. */ spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); /* bd_contains might point to self, check in a separate step */ if ((bdev_free = !bdev->bd_holders)) bdev->bd_holder = NULL; if (!bdev->bd_contains->bd_holders) bdev->bd_contains->bd_holder = NULL; spin_unlock(&bdev_lock); /* * If this was the last claim, remove holder link and * unblock evpoll if it was a write holder. */ if (bdev_free && bdev->bd_write_holder) { disk_unblock_events(bdev->bd_disk); bdev->bd_write_holder = false; } } /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE * event. This is to ensure detection of media removal commanded * from userland - e.g. eject(1). */ disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have * to updated it before every ioctl. */ if (file->f_flags & O_NDELAY) mode |= FMODE_NDELAY; else mode &= ~FMODE_NDELAY; return blkdev_ioctl(bdev, mode, cmd, arg); } /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. * * Does not take i_mutex for the write and thus is not for general purpose * use. */ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; size_t shorted = 0; ssize_t ret; if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; if (iocb->ki_pos >= size) return -ENOSPC; if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; size -= iocb->ki_pos; if (iov_iter_count(from) > size) { shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret; if (pos >= size) return 0; size -= pos; if (iov_iter_count(to) > size) { shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } ret = generic_file_read_iter(iocb, to); iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } EXPORT_SYMBOL_GPL(blkdev_read_iter); /* * Try to release a page associated with block device when the system * is under memory pressure. */ static int blkdev_releasepage(struct page *page, gfp_t wait) { struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; if (super && super->s_op->bdev_try_to_free_page) return super->s_op->bdev_try_to_free_page(super, page, wait); return try_to_free_buffers(page); } static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { return generic_writepages(mapping, wbc); } static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = blkdev_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .migratepage = buffer_migrate_page_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); loff_t end = start + len - 1; loff_t isize; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = i_size_read(bdev->bd_inode); if (start >= isize) return -EINVAL; if (end >= isize) { if (mode & FALLOC_FL_KEEP_SIZE) { len = isize - start; end = start + len - 1; } else return -EINVAL; } /* * Don't allow IO that isn't aligned to logical block size. */ if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; /* Invalidate the page cache, including dirty pages. */ error = truncate_bdev_range(bdev, file->f_mode, start, end); if (error) return error; switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: error = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); break; default: return -EOPNOTSUPP; } if (error) return error; /* * Invalidate again; if someone wandered in and dirtied a page, * the caller will be given -EBUSY. The third argument is * inclusive, so the rounding here is safe. */ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); } const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = blkdev_iopoll, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, }; /** * lookup_bdev - lookup a struct block_device by name * @pathname: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) * otherwise. */ struct block_device *lookup_bdev(const char *pathname) { struct block_device *bdev; struct inode *inode; struct path path; int error; if (!pathname || !*pathname) return ERR_PTR(-EINVAL); error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) return ERR_PTR(error); inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; error = -EACCES; if (!may_open_dev(&path)) goto fail; error = -ENOMEM; bdev = bd_acquire(inode); if (!bdev) goto fail; out: path_put(&path); return bdev; fail: bdev = ERR_PTR(error); goto out; } EXPORT_SYMBOL(lookup_bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty) { struct super_block *sb = get_super(bdev); int res = 0; if (sb) { /* * no need to lock the super, get_super holds the * read mutex so the filesystem cannot go away * under us (->put_super runs with the write lock * hold). */ shrink_dcache_sb(sb); res = invalidate_inodes(sb, kill_dirty); drop_super(sb); } invalidate_bdev(bdev); return res; } EXPORT_SYMBOL(__invalidate_device); void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || mapping->nrpages == 0) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); old_inode = inode; bdev = I_BDEV(inode); mutex_lock(&bdev->bd_mutex); if (bdev->bd_openers) func(bdev, arg); mutex_unlock(&bdev->bd_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); }
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM writeback #if !defined(_TRACE_WRITEBACK_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_WRITEBACK_H #include <linux/tracepoint.h> #include <linux/backing-dev.h> #include <linux/writeback.h> #define show_inode_state(state) \ __print_flags(state, "|", \ {I_DIRTY_SYNC, "I_DIRTY_SYNC"}, \ {I_DIRTY_DATASYNC, "I_DIRTY_DATASYNC"}, \ {I_DIRTY_PAGES, "I_DIRTY_PAGES"}, \ {I_NEW, "I_NEW"}, \ {I_WILL_FREE, "I_WILL_FREE"}, \ {I_FREEING, "I_FREEING"}, \ {I_CLEAR, "I_CLEAR"}, \ {I_SYNC, "I_SYNC"}, \ {I_DIRTY_TIME, "I_DIRTY_TIME"}, \ {I_REFERENCED, "I_REFERENCED"} \ ) /* enums need to be exported to user space */ #undef EM #undef EMe #define EM(a,b) TRACE_DEFINE_ENUM(a); #define EMe(a,b) TRACE_DEFINE_ENUM(a); #define WB_WORK_REASON \ EM( WB_REASON_BACKGROUND, "background") \ EM( WB_REASON_VMSCAN, "vmscan") \ EM( WB_REASON_SYNC, "sync") \ EM( WB_REASON_PERIODIC, "periodic") \ EM( WB_REASON_LAPTOP_TIMER, "laptop_timer") \ EM( WB_REASON_FS_FREE_SPACE, "fs_free_space") \ EMe(WB_REASON_FORKER_THREAD, "forker_thread") WB_WORK_REASON /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a,b) { a, b }, #define EMe(a,b) { a, b } struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_page_template, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(pgoff_t, index) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(mapping ? inode_to_bdi(mapping->host) : NULL), 32); __entry->ino = mapping ? mapping->host->i_ino : 0; __entry->index = page->index; ), TP_printk("bdi %s: ino=%lu index=%lu", __entry->name, (unsigned long)__entry->ino, __entry->index ) ); DEFINE_EVENT(writeback_page_template, writeback_dirty_page, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping) ); DEFINE_EVENT(writeback_page_template, wait_on_page_writeback, TP_PROTO(struct page *page, struct address_space *mapping), TP_ARGS(page, mapping) ); DECLARE_EVENT_CLASS(writeback_dirty_inode_template, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, flags) ), TP_fast_assign( struct backing_dev_info *bdi = inode_to_bdi(inode); /* may be called for files on pseudo FSes w/ unregistered bdi */ strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->flags = flags; ), TP_printk("bdi %s: ino=%lu state=%s flags=%s", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), show_inode_state(__entry->flags) ) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode, TP_PROTO(struct inode *inode, int flags), TP_ARGS(inode, flags) ); #ifdef CREATE_TRACE_POINTS #ifdef CONFIG_CGROUP_WRITEBACK static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return cgroup_ino(wb->memcg_css->cgroup); } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { if (wbc->wb) return __trace_wb_assign_cgroup(wbc->wb); else return 1; } #else /* CONFIG_CGROUP_WRITEBACK */ static inline ino_t __trace_wb_assign_cgroup(struct bdi_writeback *wb) { return 1; } static inline ino_t __trace_wbc_assign_cgroup(struct writeback_control *wbc) { return 1; } #endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CREATE_TRACE_POINTS */ #ifdef CONFIG_CGROUP_WRITEBACK TRACE_EVENT(inode_foreign_history, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned int history), TP_ARGS(inode, wbc, history), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, cgroup_ino) __field(unsigned int, history) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); __entry->history = history; ), TP_printk("bdi %s: ino=%lu cgroup_ino=%lu history=0x%x", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->cgroup_ino, __entry->history ) ); TRACE_EVENT(inode_switch_wbs, TP_PROTO(struct inode *inode, struct bdi_writeback *old_wb, struct bdi_writeback *new_wb), TP_ARGS(inode, old_wb, new_wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(ino_t, old_cgroup_ino) __field(ino_t, new_cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(old_wb->bdi), 32); __entry->ino = inode->i_ino; __entry->old_cgroup_ino = __trace_wb_assign_cgroup(old_wb); __entry->new_cgroup_ino = __trace_wb_assign_cgroup(new_wb); ), TP_printk("bdi %s: ino=%lu old_cgroup_ino=%lu new_cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, (unsigned long)__entry->old_cgroup_ino, (unsigned long)__entry->new_cgroup_ino ) ); TRACE_EVENT(track_foreign_dirty, TP_PROTO(struct page *page, struct bdi_writeback *wb), TP_ARGS(page, wb), TP_STRUCT__entry( __array(char, name, 32) __field(u64, bdi_id) __field(ino_t, ino) __field(unsigned int, memcg_id) __field(ino_t, cgroup_ino) __field(ino_t, page_cgroup_ino) ), TP_fast_assign( struct address_space *mapping = page_mapping(page); struct inode *inode = mapping ? mapping->host : NULL; strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->bdi_id = wb->bdi->id; __entry->ino = inode ? inode->i_ino : 0; __entry->memcg_id = wb->memcg_css->id; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->page_cgroup_ino = cgroup_ino(page->mem_cgroup->css.cgroup); ), TP_printk("bdi %s[%llu]: ino=%lu memcg_id=%u cgroup_ino=%lu page_cgroup_ino=%lu", __entry->name, __entry->bdi_id, (unsigned long)__entry->ino, __entry->memcg_id, (unsigned long)__entry->cgroup_ino, (unsigned long)__entry->page_cgroup_ino ) ); TRACE_EVENT(flush_foreign, TP_PROTO(struct bdi_writeback *wb, unsigned int frn_bdi_id, unsigned int frn_memcg_id), TP_ARGS(wb, frn_bdi_id, frn_memcg_id), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) __field(unsigned int, frn_bdi_id) __field(unsigned int, frn_memcg_id) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); __entry->frn_bdi_id = frn_bdi_id; __entry->frn_memcg_id = frn_memcg_id; ), TP_printk("bdi %s: cgroup_ino=%lu frn_bdi_id=%u frn_memcg_id=%u", __entry->name, (unsigned long)__entry->cgroup_ino, __entry->frn_bdi_id, __entry->frn_memcg_id ) ); #endif DECLARE_EVENT_CLASS(writeback_write_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry ( __array(char, name, 32) __field(ino_t, ino) __field(int, sync_mode) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->sync_mode = wbc->sync_mode; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, __entry->sync_mode, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc) ); DECLARE_EVENT_CLASS(writeback_work_class, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), TP_ARGS(wb, work), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_pages) __field(dev_t, sb_dev) __field(int, sync_mode) __field(int, for_kupdate) __field(int, range_cyclic) __field(int, for_background) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->nr_pages = work->nr_pages; __entry->sb_dev = work->sb ? work->sb->s_dev : 0; __entry->sync_mode = work->sync_mode; __entry->for_kupdate = work->for_kupdate; __entry->range_cyclic = work->range_cyclic; __entry->for_background = work->for_background; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d " "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup_ino=%lu", __entry->name, MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev), __entry->nr_pages, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ DEFINE_EVENT(writeback_work_class, name, \ TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \ TP_ARGS(wb, work)) DEFINE_WRITEBACK_WORK_EVENT(writeback_queue); DEFINE_WRITEBACK_WORK_EVENT(writeback_exec); DEFINE_WRITEBACK_WORK_EVENT(writeback_start); DEFINE_WRITEBACK_WORK_EVENT(writeback_written); DEFINE_WRITEBACK_WORK_EVENT(writeback_wait); TRACE_EVENT(writeback_pages_written, TP_PROTO(long pages_written), TP_ARGS(pages_written), TP_STRUCT__entry( __field(long, pages) ), TP_fast_assign( __entry->pages = pages_written; ), TP_printk("%ld", __entry->pages) ); DECLARE_EVENT_CLASS(writeback_class, TP_PROTO(struct bdi_writeback *wb), TP_ARGS(wb), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: cgroup_ino=%lu", __entry->name, (unsigned long)__entry->cgroup_ino ) ); #define DEFINE_WRITEBACK_EVENT(name) \ DEFINE_EVENT(writeback_class, name, \ TP_PROTO(struct bdi_writeback *wb), \ TP_ARGS(wb)) DEFINE_WRITEBACK_EVENT(writeback_wake_background); TRACE_EVENT(writeback_bdi_register, TP_PROTO(struct backing_dev_info *bdi), TP_ARGS(bdi), TP_STRUCT__entry( __array(char, name, 32) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); ), TP_printk("bdi %s", __entry->name ) ); DECLARE_EVENT_CLASS(wbc_class, TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), TP_ARGS(wbc, bdi), TP_STRUCT__entry( __array(char, name, 32) __field(long, nr_to_write) __field(long, pages_skipped) __field(int, sync_mode) __field(int, for_kupdate) __field(int, for_background) __field(int, for_reclaim) __field(int, range_cyclic) __field(long, range_start) __field(long, range_end) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(bdi), 32); __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->for_background = wbc->for_background; __entry->for_reclaim = wbc->for_reclaim; __entry->range_cyclic = wbc->range_cyclic; __entry->range_start = (long)wbc->range_start; __entry->range_end = (long)wbc->range_end; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d " "bgrd=%d reclm=%d cyclic=%d " "start=0x%lx end=0x%lx cgroup_ino=%lu", __entry->name, __entry->nr_to_write, __entry->pages_skipped, __entry->sync_mode, __entry->for_kupdate, __entry->for_background, __entry->for_reclaim, __entry->range_cyclic, __entry->range_start, __entry->range_end, (unsigned long)__entry->cgroup_ino ) ) #define DEFINE_WBC_EVENT(name) \ DEFINE_EVENT(wbc_class, name, \ TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \ TP_ARGS(wbc, bdi)) DEFINE_WBC_EVENT(wbc_writepage); TRACE_EVENT(writeback_queue_io, TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work, unsigned long dirtied_before, int moved), TP_ARGS(wb, work, dirtied_before, moved), TP_STRUCT__entry( __array(char, name, 32) __field(unsigned long, older) __field(long, age) __field(int, moved) __field(int, reason) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(wb->bdi), 32); __entry->older = dirtied_before; __entry->age = (jiffies - dirtied_before) * 1000 / HZ; __entry->moved = moved; __entry->reason = work->reason; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup_ino=%lu", __entry->name, __entry->older, /* dirtied_before in jiffies */ __entry->age, /* dirtied_before in relative milliseconds */ __entry->moved, __print_symbolic(__entry->reason, WB_WORK_REASON), (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(global_dirty_state, TP_PROTO(unsigned long background_thresh, unsigned long dirty_thresh ), TP_ARGS(background_thresh, dirty_thresh ), TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) __field(unsigned long, nr_dirtied) __field(unsigned long, nr_written) ), TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; __entry->dirty_thresh = dirty_thresh; __entry->dirty_limit = global_wb_domain.dirty_limit; ), TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, __entry->nr_dirtied, __entry->nr_written ) ); #define KBps(x) ((x) << (PAGE_SHIFT - 10)) TRACE_EVENT(bdi_dirty_ratelimit, TP_PROTO(struct bdi_writeback *wb, unsigned long dirty_rate, unsigned long task_ratelimit), TP_ARGS(wb, dirty_rate, task_ratelimit), TP_STRUCT__entry( __array(char, bdi, 32) __field(unsigned long, write_bw) __field(unsigned long, avg_write_bw) __field(unsigned long, dirty_rate) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned long, balanced_dirty_ratelimit) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->write_bw = KBps(wb->write_bandwidth); __entry->avg_write_bw = KBps(wb->avg_write_bandwidth); __entry->dirty_rate = KBps(dirty_rate); __entry->dirty_ratelimit = KBps(wb->dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->balanced_dirty_ratelimit = KBps(wb->balanced_dirty_ratelimit); __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "write_bw=%lu awrite_bw=%lu dirty_rate=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "balanced_dirty_ratelimit=%lu cgroup_ino=%lu", __entry->bdi, __entry->write_bw, /* write bandwidth */ __entry->avg_write_bw, /* avg write bandwidth */ __entry->dirty_rate, /* bdi dirty rate */ __entry->dirty_ratelimit, /* base ratelimit */ __entry->task_ratelimit, /* ratelimit with position control */ __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(balance_dirty_pages, TP_PROTO(struct bdi_writeback *wb, unsigned long thresh, unsigned long bg_thresh, unsigned long dirty, unsigned long bdi_thresh, unsigned long bdi_dirty, unsigned long dirty_ratelimit, unsigned long task_ratelimit, unsigned long dirtied, unsigned long period, long pause, unsigned long start_time), TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty, dirty_ratelimit, task_ratelimit, dirtied, period, pause, start_time), TP_STRUCT__entry( __array( char, bdi, 32) __field(unsigned long, limit) __field(unsigned long, setpoint) __field(unsigned long, dirty) __field(unsigned long, bdi_setpoint) __field(unsigned long, bdi_dirty) __field(unsigned long, dirty_ratelimit) __field(unsigned long, task_ratelimit) __field(unsigned int, dirtied) __field(unsigned int, dirtied_pause) __field(unsigned long, paused) __field( long, pause) __field(unsigned long, period) __field( long, think) __field(ino_t, cgroup_ino) ), TP_fast_assign( unsigned long freerun = (thresh + bg_thresh) / 2; strscpy_pad(__entry->bdi, bdi_dev_name(wb->bdi), 32); __entry->limit = global_wb_domain.dirty_limit; __entry->setpoint = (global_wb_domain.dirty_limit + freerun) / 2; __entry->dirty = dirty; __entry->bdi_setpoint = __entry->setpoint * bdi_thresh / (thresh + 1); __entry->bdi_dirty = bdi_dirty; __entry->dirty_ratelimit = KBps(dirty_ratelimit); __entry->task_ratelimit = KBps(task_ratelimit); __entry->dirtied = dirtied; __entry->dirtied_pause = current->nr_dirtied_pause; __entry->think = current->dirty_paused_when == 0 ? 0 : (long)(jiffies - current->dirty_paused_when) * 1000/HZ; __entry->period = period * 1000 / HZ; __entry->pause = pause * 1000 / HZ; __entry->paused = (jiffies - start_time) * 1000 / HZ; __entry->cgroup_ino = __trace_wb_assign_cgroup(wb); ), TP_printk("bdi %s: " "limit=%lu setpoint=%lu dirty=%lu " "bdi_setpoint=%lu bdi_dirty=%lu " "dirty_ratelimit=%lu task_ratelimit=%lu " "dirtied=%u dirtied_pause=%u " "paused=%lu pause=%ld period=%lu think=%ld cgroup_ino=%lu", __entry->bdi, __entry->limit, __entry->setpoint, __entry->dirty, __entry->bdi_setpoint, __entry->bdi_dirty, __entry->dirty_ratelimit, __entry->task_ratelimit, __entry->dirtied, __entry->dirtied_pause, __entry->paused, /* ms */ __entry->pause, /* ms */ __entry->period, /* ms */ __entry->think, /* ms */ (unsigned long)__entry->cgroup_ino ) ); TRACE_EVENT(writeback_sb_inodes_requeue, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->cgroup_ino = __trace_wb_assign_cgroup(inode_to_wb(inode)); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, (unsigned long)__entry->cgroup_ino ) ); DECLARE_EVENT_CLASS(writeback_congest_waited_template, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed), TP_STRUCT__entry( __field( unsigned int, usec_timeout ) __field( unsigned int, usec_delayed ) ), TP_fast_assign( __entry->usec_timeout = usec_timeout; __entry->usec_delayed = usec_delayed; ), TP_printk("usec_timeout=%u usec_delayed=%u", __entry->usec_timeout, __entry->usec_delayed) ); DEFINE_EVENT(writeback_congest_waited_template, writeback_congestion_wait, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed) ); DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested, TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed), TP_ARGS(usec_timeout, usec_delayed) ); DECLARE_EVENT_CLASS(writeback_single_inode_template, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write ), TP_ARGS(inode, wbc, nr_to_write), TP_STRUCT__entry( __array(char, name, 32) __field(ino_t, ino) __field(unsigned long, state) __field(unsigned long, dirtied_when) __field(unsigned long, writeback_index) __field(long, nr_to_write) __field(unsigned long, wrote) __field(ino_t, cgroup_ino) ), TP_fast_assign( strscpy_pad(__entry->name, bdi_dev_name(inode_to_bdi(inode)), 32); __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->dirtied_when = inode->dirtied_when; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->nr_to_write = nr_to_write; __entry->wrote = nr_to_write - wbc->nr_to_write; __entry->cgroup_ino = __trace_wbc_assign_cgroup(wbc); ), TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu " "index=%lu to_write=%ld wrote=%lu cgroup_ino=%lu", __entry->name, (unsigned long)__entry->ino, show_inode_state(__entry->state), __entry->dirtied_when, (jiffies - __entry->dirtied_when) / HZ, __entry->writeback_index, __entry->nr_to_write, __entry->wrote, (unsigned long)__entry->cgroup_ino ) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_start, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode, TP_PROTO(struct inode *inode, struct writeback_control *wbc, unsigned long nr_to_write), TP_ARGS(inode, wbc, nr_to_write) ); DECLARE_EVENT_CLASS(writeback_inode_template, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, state ) __field( __u16, mode ) __field(unsigned long, dirtied_when ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->state = inode->i_state; __entry->mode = inode->i_mode; __entry->dirtied_when = inode->dirtied_when; ), TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long)__entry->ino, __entry->dirtied_when, show_inode_state(__entry->state), __entry->mode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_lazytime_iput, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, writeback_dirty_inode_enqueue, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* * Inode writeback list tracking. */ DEFINE_EVENT(writeback_inode_template, sb_mark_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(writeback_inode_template, sb_clear_inode_writeback, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); #endif /* _TRACE_WRITEBACK_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 /* SPDX-License-Identifier: GPL-2.0-only */ /* * kernfs.h - pseudo filesystem decoupled from vfs locking */ #ifndef __LINUX_KERNFS_H #define __LINUX_KERNFS_H #include <linux/kernel.h> #include <linux/err.h> #include <linux/list.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/lockdep.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/uidgid.h> #include <linux/wait.h> struct file; struct dentry; struct iattr; struct seq_file; struct vm_area_struct; struct super_block; struct file_system_type; struct poll_table_struct; struct fs_context; struct kernfs_fs_context; struct kernfs_open_node; struct kernfs_iattrs; enum kernfs_node_type { KERNFS_DIR = 0x0001, KERNFS_FILE = 0x0002, KERNFS_LINK = 0x0004, }; #define KERNFS_TYPE_MASK 0x000f #define KERNFS_FLAG_MASK ~KERNFS_TYPE_MASK #define KERNFS_MAX_USER_XATTRS 128 #define KERNFS_USER_XATTR_SIZE_LIMIT (128 << 10) enum kernfs_node_flag { KERNFS_ACTIVATED = 0x0010, KERNFS_NS = 0x0020, KERNFS_HAS_SEQ_SHOW = 0x0040, KERNFS_HAS_MMAP = 0x0080, KERNFS_LOCKDEP = 0x0100, KERNFS_SUICIDAL = 0x0400, KERNFS_SUICIDED = 0x0800, KERNFS_EMPTY_DIR = 0x1000, KERNFS_HAS_RELEASE = 0x2000, }; /* @flags for kernfs_create_root() */ enum kernfs_root_flag { /* * kernfs_nodes are created in the deactivated state and invisible. * They require explicit kernfs_activate() to become visible. This * can be used to make related nodes become visible atomically * after all nodes are created successfully. */ KERNFS_ROOT_CREATE_DEACTIVATED = 0x0001, /* * For regular files, if the opener has CAP_DAC_OVERRIDE, open(2) * succeeds regardless of the RW permissions. sysfs had an extra * layer of enforcement where open(2) fails with -EACCES regardless * of CAP_DAC_OVERRIDE if the permission doesn't have the * respective read or write access at all (none of S_IRUGO or * S_IWUGO) or the respective operation isn't implemented. The * following flag enables that behavior. */ KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK = 0x0002, /* * The filesystem supports exportfs operation, so userspace can use * fhandle to access nodes of the fs. */ KERNFS_ROOT_SUPPORT_EXPORTOP = 0x0004, /* * Support user xattrs to be written to nodes rooted at this root. */ KERNFS_ROOT_SUPPORT_USER_XATTR = 0x0008, }; /* type-specific structures for kernfs_node union members */ struct kernfs_elem_dir { unsigned long subdirs; /* children rbtree starts here and goes through kn->rb */ struct rb_root children; /* * The kernfs hierarchy this directory belongs to. This fits * better directly in kernfs_node but is here to save space. */ struct kernfs_root *root; }; struct kernfs_elem_symlink { struct kernfs_node *target_kn; }; struct kernfs_elem_attr { const struct kernfs_ops *ops; struct kernfs_open_node *open; loff_t size; struct kernfs_node *notify_next; /* for kernfs_notify() */ }; /* * kernfs_node - the building block of kernfs hierarchy. Each and every * kernfs node is represented by single kernfs_node. Most fields are * private to kernfs and shouldn't be accessed directly by kernfs users. * * As long as s_count reference is held, the kernfs_node itself is * accessible. Dereferencing elem or any other outer entity requires * active reference. */ struct kernfs_node { atomic_t count; atomic_t active; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif /* * Use kernfs_get_parent() and kernfs_name/path() instead of * accessing the following two fields directly. If the node is * never moved to a different parent, it is safe to access the * parent directly. */ struct kernfs_node *parent; const char *name; struct rb_node rb; const void *ns; /* namespace tag */ unsigned int hash; /* ns + name hash */ union { struct kernfs_elem_dir dir; struct kernfs_elem_symlink symlink; struct kernfs_elem_attr attr; }; void *priv; /* * 64bit unique ID. On 64bit ino setups, id is the ino. On 32bit, * the low 32bits are ino and upper generation. */ u64 id; unsigned short flags; umode_t mode; struct kernfs_iattrs *iattr; }; /* * kernfs_syscall_ops may be specified on kernfs_create_root() to support * syscalls. These optional callbacks are invoked on the matching syscalls * and can perform any kernfs operations which don't necessarily have to be * the exact operation requested. An active reference is held for each * kernfs_node parameter. */ struct kernfs_syscall_ops { int (*show_options)(struct seq_file *sf, struct kernfs_root *root); int (*mkdir)(struct kernfs_node *parent, const char *name, umode_t mode); int (*rmdir)(struct kernfs_node *kn); int (*rename)(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name); int (*show_path)(struct seq_file *sf, struct kernfs_node *kn, struct kernfs_root *root); }; struct kernfs_root { /* published fields */ struct kernfs_node *kn; unsigned int flags; /* KERNFS_ROOT_* flags */ /* private fields, do not use outside kernfs proper */ struct idr ino_idr; u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; /* list of kernfs_super_info of this root, protected by kernfs_mutex */ struct list_head supers; wait_queue_head_t deactivate_waitq; }; struct kernfs_open_file { /* published fields */ struct kernfs_node *kn; struct file *file; struct seq_file *seq_file; void *priv; /* private fields, do not use outside kernfs proper */ struct mutex mutex; struct mutex prealloc_mutex; int event; struct list_head list; char *prealloc_buf; size_t atomic_write_len; bool mmapped:1; bool released:1; const struct vm_operations_struct *vm_ops; }; struct kernfs_ops { /* * Optional open/release methods. Both are called with * @of->seq_file populated. */ int (*open)(struct kernfs_open_file *of); void (*release)(struct kernfs_open_file *of); /* * Read is handled by either seq_file or raw_read(). * * If seq_show() is present, seq_file path is active. Other seq * operations are optional and if not implemented, the behavior is * equivalent to single_open(). @sf->private points to the * associated kernfs_open_file. * * read() is bounced through kernel buffer and a read larger than * PAGE_SIZE results in partial operation of PAGE_SIZE. */ int (*seq_show)(struct seq_file *sf, void *v); void *(*seq_start)(struct seq_file *sf, loff_t *ppos); void *(*seq_next)(struct seq_file *sf, void *v, loff_t *ppos); void (*seq_stop)(struct seq_file *sf, void *v); ssize_t (*read)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); /* * write() is bounced through kernel buffer. If atomic_write_len * is not set, a write larger than PAGE_SIZE results in partial * operations of PAGE_SIZE chunks. If atomic_write_len is set, * writes upto the specified size are executed atomically but * larger ones are rejected with -E2BIG. */ size_t atomic_write_len; /* * "prealloc" causes a buffer to be allocated at open for * all read/write requests. As ->seq_show uses seq_read() * which does its own allocation, it is incompatible with * ->prealloc. Provide ->read and ->write with ->prealloc. */ bool prealloc; ssize_t (*write)(struct kernfs_open_file *of, char *buf, size_t bytes, loff_t off); __poll_t (*poll)(struct kernfs_open_file *of, struct poll_table_struct *pt); int (*mmap)(struct kernfs_open_file *of, struct vm_area_struct *vma); #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lock_class_key lockdep_key; #endif }; /* * The kernfs superblock creation/mount parameter context. */ struct kernfs_fs_context { struct kernfs_root *root; /* Root of the hierarchy being mounted */ void *ns_tag; /* Namespace tag of the mount (or NULL) */ unsigned long magic; /* File system specific magic number */ /* The following are set/used by kernfs_mount() */ bool new_sb_created; /* Set to T if we allocated a new sb */ }; #ifdef CONFIG_KERNFS static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return kn->flags & KERNFS_TYPE_MASK; } static inline ino_t kernfs_id_ino(u64 id) { /* id is ino if ino_t is 64bit; otherwise, low 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return id; else return (u32)id; } static inline u32 kernfs_id_gen(u64 id) { /* gen is fixed at 1 if ino_t is 64bit; otherwise, high 32bits */ if (sizeof(ino_t) >= sizeof(u64)) return 1; else return id >> 32; } static inline ino_t kernfs_ino(struct kernfs_node *kn) { return kernfs_id_ino(kn->id); } static inline ino_t kernfs_gen(struct kernfs_node *kn) { return kernfs_id_gen(kn->id); } /** * kernfs_enable_ns - enable namespace under a directory * @kn: directory of interest, should be empty * * This is to be called right after @kn is created to enable namespace * under it. All children of @kn must have non-NULL namespace tags and * only the ones which match the super_block's tag will be visible. */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { WARN_ON_ONCE(kernfs_type(kn) != KERNFS_DIR); WARN_ON_ONCE(!RB_EMPTY_ROOT(&kn->dir.children)); kn->flags |= KERNFS_NS; } /** * kernfs_ns_enabled - test whether namespace is enabled * @kn: the node to test * * Test whether namespace filtering is enabled for the children of @ns. */ static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return kn->flags & KERNFS_NS; } int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen); int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen); void pr_cont_kernfs_name(struct kernfs_node *kn); void pr_cont_kernfs_path(struct kernfs_node *kn); struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn); struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns); struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns); void kernfs_get(struct kernfs_node *kn); void kernfs_put(struct kernfs_node *kn); struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry); struct kernfs_root *kernfs_root_from_sb(struct super_block *sb); struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); struct dentry *kernfs_node_dentry(struct kernfs_node *kn, struct super_block *sb); struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv); void kernfs_destroy_root(struct kernfs_root *root); struct kernfs_node *kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns); struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, const char *name); struct kernfs_node *__kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key); struct kernfs_node *kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target); void kernfs_activate(struct kernfs_node *kn); void kernfs_remove(struct kernfs_node *kn); void kernfs_break_active_protection(struct kernfs_node *kn); void kernfs_unbreak_active_protection(struct kernfs_node *kn); bool kernfs_remove_self(struct kernfs_node *kn); int kernfs_remove_by_name_ns(struct kernfs_node *parent, const char *name, const void *ns); int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns); int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); __poll_t kernfs_generic_poll(struct kernfs_open_file *of, struct poll_table_struct *pt); void kernfs_notify(struct kernfs_node *kn); int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size); int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags); const void *kernfs_super_ns(struct super_block *sb); int kernfs_get_tree(struct fs_context *fc); void kernfs_free_fs_context(struct fs_context *fc); void kernfs_kill_sb(struct super_block *sb); void kernfs_init(void); struct kernfs_node *kernfs_find_and_get_node_by_id(struct kernfs_root *root, u64 id); #else /* CONFIG_KERNFS */ static inline enum kernfs_node_type kernfs_type(struct kernfs_node *kn) { return 0; } /* whatever */ static inline void kernfs_enable_ns(struct kernfs_node *kn) { } static inline bool kernfs_ns_enabled(struct kernfs_node *kn) { return false; } static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline int kernfs_path_from_node(struct kernfs_node *root_kn, struct kernfs_node *kn, char *buf, size_t buflen) { return -ENOSYS; } static inline void pr_cont_kernfs_name(struct kernfs_node *kn) { } static inline void pr_cont_kernfs_path(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { return NULL; } static inline struct kernfs_node * kernfs_find_and_get_ns(struct kernfs_node *parent, const char *name, const void *ns) { return NULL; } static inline struct kernfs_node * kernfs_walk_and_get_ns(struct kernfs_node *parent, const char *path, const void *ns) { return NULL; } static inline void kernfs_get(struct kernfs_node *kn) { } static inline void kernfs_put(struct kernfs_node *kn) { } static inline struct kernfs_node *kernfs_node_from_dentry(struct dentry *dentry) { return NULL; } static inline struct kernfs_root *kernfs_root_from_sb(struct super_block *sb) { return NULL; } static inline struct inode * kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn) { return NULL; } static inline struct kernfs_root * kernfs_create_root(struct kernfs_syscall_ops *scops, unsigned int flags, void *priv) { return ERR_PTR(-ENOSYS); } static inline void kernfs_destroy_root(struct kernfs_root *root) { } static inline struct kernfs_node * kernfs_create_dir_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, void *priv, const void *ns) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * __kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns, struct lock_class_key *key) { return ERR_PTR(-ENOSYS); } static inline struct kernfs_node * kernfs_create_link(struct kernfs_node *parent, const char *name, struct kernfs_node *target) { return ERR_PTR(-ENOSYS); } static inline void kernfs_activate(struct kernfs_node *kn) { } static inline void kernfs_remove(struct kernfs_node *kn) { } static inline bool kernfs_remove_self(struct kernfs_node *kn) { return false; } static inline int kernfs_remove_by_name_ns(struct kernfs_node *kn, const char *name, const void *ns) { return -ENOSYS; } static inline int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name, const void *new_ns) { return -ENOSYS; } static inline int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr) { return -ENOSYS; } static inline void kernfs_notify(struct kernfs_node *kn) { } static inline int kernfs_xattr_get(struct kernfs_node *kn, const char *name, void *value, size_t size) { return -ENOSYS; } static inline int kernfs_xattr_set(struct kernfs_node *kn, const char *name, const void *value, size_t size, int flags) { return -ENOSYS; } static inline const void *kernfs_super_ns(struct super_block *sb) { return NULL; } static inline int kernfs_get_tree(struct fs_context *fc) { return -ENOSYS; } static inline void kernfs_free_fs_context(struct fs_context *fc) { } static inline void kernfs_kill_sb(struct super_block *sb) { } static inline void kernfs_init(void) { } #endif /* CONFIG_KERNFS */ /** * kernfs_path - build full path of a given node * @kn: kernfs_node of interest * @buf: buffer to copy @kn's name into * @buflen: size of @buf * * If @kn is NULL result will be "(null)". * * Returns the length of the full path. If the full length is equal to or * greater than @buflen, @buf contains the truncated path with the trailing * '\0'. On error, -errno is returned. */ static inline int kernfs_path(struct kernfs_node *kn, char *buf, size_t buflen) { return kernfs_path_from_node(kn, NULL, buf, buflen); } static inline struct kernfs_node * kernfs_find_and_get(struct kernfs_node *kn, const char *name) { return kernfs_find_and_get_ns(kn, name, NULL); } static inline struct kernfs_node * kernfs_walk_and_get(struct kernfs_node *kn, const char *path) { return kernfs_walk_and_get_ns(kn, path, NULL); } static inline struct kernfs_node * kernfs_create_dir(struct kernfs_node *parent, const char *name, umode_t mode, void *priv) { return kernfs_create_dir_ns(parent, name, mode, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, priv, NULL); } static inline struct kernfs_node * kernfs_create_file_ns(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, loff_t size, const struct kernfs_ops *ops, void *priv, const void *ns) { struct lock_class_key *key = NULL; #ifdef CONFIG_DEBUG_LOCK_ALLOC key = (struct lock_class_key *)&ops->lockdep_key; #endif return __kernfs_create_file(parent, name, mode, uid, gid, size, ops, priv, ns, key); } static inline struct kernfs_node * kernfs_create_file(struct kernfs_node *parent, const char *name, umode_t mode, loff_t size, const struct kernfs_ops *ops, void *priv) { return kernfs_create_file_ns(parent, name, mode, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, size, ops, priv, NULL); } static inline int kernfs_remove_by_name(struct kernfs_node *parent, const char *name) { return kernfs_remove_by_name_ns(parent, name, NULL); } static inline int kernfs_rename(struct kernfs_node *kn, struct kernfs_node *new_parent, const char *new_name) { return kernfs_rename_ns(kn, new_parent, new_name, NULL); } #endif /* __LINUX_KERNFS_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 // SPDX-License-Identifier: GPL-2.0 /* * Kernel internal timers * * Copyright (C) 1991, 1992 Linus Torvalds * * 1997-01-28 Modified by Finn Arne Gangstad to make timers scale better. * * 1997-09-10 Updated NTP code according to technical memorandum Jan '96 * "A Kernel Model for Precision Timekeeping" by Dave Mills * 1998-12-24 Fixed a xtime SMP race (we need the xtime_lock rw spinlock to * serialize accesses to xtime/lost_ticks). * Copyright (C) 1998 Andrea Arcangeli * 1999-03-10 Improved NTP compatibility by Ulrich Windl * 2002-05-31 Move sys_sysinfo here and make its locking sane, Robert Love * 2000-10-05 Implemented scalable SMP per-CPU timer handling. * Copyright (C) 2000, 2001, 2002 Ingo Molnar * Designed by David S. Miller, Alexey Kuznetsov and Ingo Molnar */ #include <linux/kernel_stat.h> #include <linux/export.h> #include <linux/interrupt.h> #include <linux/percpu.h> #include <linux/init.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/pid_namespace.h> #include <linux/notifier.h> #include <linux/thread_info.h> #include <linux/time.h> #include <linux/jiffies.h> #include <linux/posix-timers.h> #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/delay.h> #include <linux/tick.h> #include <linux/kallsyms.h> #include <linux/irq_work.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/random.h> #include <linux/uaccess.h> #include <asm/unistd.h> #include <asm/div64.h> #include <asm/timex.h> #include <asm/io.h> #include "tick-internal.h" #define CREATE_TRACE_POINTS #include <trace/events/timer.h> __visible u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES; EXPORT_SYMBOL(jiffies_64); /* * The timer wheel has LVL_DEPTH array levels. Each level provides an array of * LVL_SIZE buckets. Each level is driven by its own clock and therefor each * level has a different granularity. * * The level granularity is: LVL_CLK_DIV ^ lvl * The level clock frequency is: HZ / (LVL_CLK_DIV ^ level) * * The array level of a newly armed timer depends on the relative expiry * time. The farther the expiry time is away the higher the array level and * therefor the granularity becomes. * * Contrary to the original timer wheel implementation, which aims for 'exact' * expiry of the timers, this implementation removes the need for recascading * the timers into the lower array levels. The previous 'classic' timer wheel * implementation of the kernel already violated the 'exact' expiry by adding * slack to the expiry time to provide batched expiration. The granularity * levels provide implicit batching. * * This is an optimization of the original timer wheel implementation for the * majority of the timer wheel use cases: timeouts. The vast majority of * timeout timers (networking, disk I/O ...) are canceled before expiry. If * the timeout expires it indicates that normal operation is disturbed, so it * does not matter much whether the timeout comes with a slight delay. * * The only exception to this are networking timers with a small expiry * time. They rely on the granularity. Those fit into the first wheel level, * which has HZ granularity. * * We don't have cascading anymore. timers with a expiry time above the * capacity of the last wheel level are force expired at the maximum timeout * value of the last wheel level. From data sampling we know that the maximum * value observed is 5 days (network connection tracking), so this should not * be an issue. * * The currently chosen array constants values are a good compromise between * array size and granularity. * * This results in the following granularity and range levels: * * HZ 1000 steps * Level Offset Granularity Range * 0 0 1 ms 0 ms - 63 ms * 1 64 8 ms 64 ms - 511 ms * 2 128 64 ms 512 ms - 4095 ms (512ms - ~4s) * 3 192 512 ms 4096 ms - 32767 ms (~4s - ~32s) * 4 256 4096 ms (~4s) 32768 ms - 262143 ms (~32s - ~4m) * 5 320 32768 ms (~32s) 262144 ms - 2097151 ms (~4m - ~34m) * 6 384 262144 ms (~4m) 2097152 ms - 16777215 ms (~34m - ~4h) * 7 448 2097152 ms (~34m) 16777216 ms - 134217727 ms (~4h - ~1d) * 8 512 16777216 ms (~4h) 134217728 ms - 1073741822 ms (~1d - ~12d) * * HZ 300 * Level Offset Granularity Range * 0 0 3 ms 0 ms - 210 ms * 1 64 26 ms 213 ms - 1703 ms (213ms - ~1s) * 2 128 213 ms 1706 ms - 13650 ms (~1s - ~13s) * 3 192 1706 ms (~1s) 13653 ms - 109223 ms (~13s - ~1m) * 4 256 13653 ms (~13s) 109226 ms - 873810 ms (~1m - ~14m) * 5 320 109226 ms (~1m) 873813 ms - 6990503 ms (~14m - ~1h) * 6 384 873813 ms (~14m) 6990506 ms - 55924050 ms (~1h - ~15h) * 7 448 6990506 ms (~1h) 55924053 ms - 447392423 ms (~15h - ~5d) * 8 512 55924053 ms (~15h) 447392426 ms - 3579139406 ms (~5d - ~41d) * * HZ 250 * Level Offset Granularity Range * 0 0 4 ms 0 ms - 255 ms * 1 64 32 ms 256 ms - 2047 ms (256ms - ~2s) * 2 128 256 ms 2048 ms - 16383 ms (~2s - ~16s) * 3 192 2048 ms (~2s) 16384 ms - 131071 ms (~16s - ~2m) * 4 256 16384 ms (~16s) 131072 ms - 1048575 ms (~2m - ~17m) * 5 320 131072 ms (~2m) 1048576 ms - 8388607 ms (~17m - ~2h) * 6 384 1048576 ms (~17m) 8388608 ms - 67108863 ms (~2h - ~18h) * 7 448 8388608 ms (~2h) 67108864 ms - 536870911 ms (~18h - ~6d) * 8 512 67108864 ms (~18h) 536870912 ms - 4294967288 ms (~6d - ~49d) * * HZ 100 * Level Offset Granularity Range * 0 0 10 ms 0 ms - 630 ms * 1 64 80 ms 640 ms - 5110 ms (640ms - ~5s) * 2 128 640 ms 5120 ms - 40950 ms (~5s - ~40s) * 3 192 5120 ms (~5s) 40960 ms - 327670 ms (~40s - ~5m) * 4 256 40960 ms (~40s) 327680 ms - 2621430 ms (~5m - ~43m) * 5 320 327680 ms (~5m) 2621440 ms - 20971510 ms (~43m - ~5h) * 6 384 2621440 ms (~43m) 20971520 ms - 167772150 ms (~5h - ~1d) * 7 448 20971520 ms (~5h) 167772160 ms - 1342177270 ms (~1d - ~15d) */ /* Clock divisor for the next level */ #define LVL_CLK_SHIFT 3 #define LVL_CLK_DIV (1UL << LVL_CLK_SHIFT) #define LVL_CLK_MASK (LVL_CLK_DIV - 1) #define LVL_SHIFT(n) ((n) * LVL_CLK_SHIFT) #define LVL_GRAN(n) (1UL << LVL_SHIFT(n)) /* * The time start value for each level to select the bucket at enqueue * time. We start from the last possible delta of the previous level * so that we can later add an extra LVL_GRAN(n) to n (see calc_index()). */ #define LVL_START(n) ((LVL_SIZE - 1) << (((n) - 1) * LVL_CLK_SHIFT)) /* Size of each clock level */ #define LVL_BITS 6 #define LVL_SIZE (1UL << LVL_BITS) #define LVL_MASK (LVL_SIZE - 1) #define LVL_OFFS(n) ((n) * LVL_SIZE) /* Level depth */ #if HZ > 100 # define LVL_DEPTH 9 # else # define LVL_DEPTH 8 #endif /* The cutoff (max. capacity of the wheel) */ #define WHEEL_TIMEOUT_CUTOFF (LVL_START(LVL_DEPTH)) #define WHEEL_TIMEOUT_MAX (WHEEL_TIMEOUT_CUTOFF - LVL_GRAN(LVL_DEPTH - 1)) /* * The resulting wheel size. If NOHZ is configured we allocate two * wheels so we have a separate storage for the deferrable timers. */ #define WHEEL_SIZE (LVL_SIZE * LVL_DEPTH) #ifdef CONFIG_NO_HZ_COMMON # define NR_BASES 2 # define BASE_STD 0 # define BASE_DEF 1 #else # define NR_BASES 1 # define BASE_STD 0 # define BASE_DEF 0 #endif struct timer_base { raw_spinlock_t lock; struct timer_list *running_timer; #ifdef CONFIG_PREEMPT_RT spinlock_t expiry_lock; atomic_t timer_waiters; #endif unsigned long clk; unsigned long next_expiry; unsigned int cpu; bool next_expiry_recalc; bool is_idle; bool timers_pending; DECLARE_BITMAP(pending_map, WHEEL_SIZE); struct hlist_head vectors[WHEEL_SIZE]; } ____cacheline_aligned; static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); #ifdef CONFIG_NO_HZ_COMMON static DEFINE_STATIC_KEY_FALSE(timers_nohz_active); static DEFINE_MUTEX(timer_keys_mutex); static void timer_update_keys(struct work_struct *work); static DECLARE_WORK(timer_update_work, timer_update_keys); #ifdef CONFIG_SMP unsigned int sysctl_timer_migration = 1; DEFINE_STATIC_KEY_FALSE(timers_migration_enabled); static void timers_update_migration(void) { if (sysctl_timer_migration && tick_nohz_active) static_branch_enable(&timers_migration_enabled); else static_branch_disable(&timers_migration_enabled); } #else static inline void timers_update_migration(void) { } #endif /* !CONFIG_SMP */ static void timer_update_keys(struct work_struct *work) { mutex_lock(&timer_keys_mutex); timers_update_migration(); static_branch_enable(&timers_nohz_active); mutex_unlock(&timer_keys_mutex); } void timers_update_nohz(void) { schedule_work(&timer_update_work); } int timer_migration_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; mutex_lock(&timer_keys_mutex); ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(); mutex_unlock(&timer_keys_mutex); return ret; } static inline bool is_timers_nohz_active(void) { return static_branch_unlikely(&timers_nohz_active); } #else static inline bool is_timers_nohz_active(void) { return false; } #endif /* NO_HZ_COMMON */ static unsigned long round_jiffies_common(unsigned long j, int cpu, bool force_up) { int rem; unsigned long original = j; /* * We don't want all cpus firing their timers at once hitting the * same lock or cachelines, so we skew each extra cpu with an extra * 3 jiffies. This 3 jiffies came originally from the mm/ code which * already did this. * The skew is done by adding 3*cpunr, then round, then subtract this * extra offset again. */ j += cpu * 3; rem = j % HZ; /* * If the target jiffie is just after a whole second (which can happen * due to delays of the timer irq, long irq off times etc etc) then * we should round down to the whole second, not up. Use 1/4th second * as cutoff for this rounding as an extreme upper bound for this. * But never round down if @force_up is set. */ if (rem < HZ/4 && !force_up) /* round down */ j = j - rem; else /* round up */ j = j - rem + HZ; /* now that we have rounded, subtract the extra skew again */ j -= cpu * 3; /* * Make sure j is still in the future. Otherwise return the * unmodified value. */ return time_is_after_jiffies(j) ? j : original; } /** * __round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, false); } EXPORT_SYMBOL_GPL(__round_jiffies); /** * __round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * __round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The exact rounding is skewed for each processor to avoid all * processors firing at the exact same time, which could lead * to lock contention or spurious cache line bouncing. * * The return value is the rounded version of the @j parameter. */ unsigned long __round_jiffies_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, false) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_relative); /** * round_jiffies - function to round jiffies to a full second * @j: the time in (absolute) jiffies that should be rounded * * round_jiffies() rounds an absolute time in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), false); } EXPORT_SYMBOL_GPL(round_jiffies); /** * round_jiffies_relative - function to round jiffies to a full second * @j: the time in (relative) jiffies that should be rounded * * round_jiffies_relative() rounds a time delta in the future (in jiffies) * up or down to (approximately) full seconds. This is useful for timers * for which the exact time they fire does not matter too much, as long as * they fire approximately every X seconds. * * By rounding these timers to whole seconds, all such timers will fire * at the same time, rather than at various times spread out. The goal * of this is to have the CPU wake up less, which saves power. * * The return value is the rounded version of the @j parameter. */ unsigned long round_jiffies_relative(unsigned long j) { return __round_jiffies_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_relative); /** * __round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up(unsigned long j, int cpu) { return round_jiffies_common(j, cpu, true); } EXPORT_SYMBOL_GPL(__round_jiffies_up); /** * __round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * @cpu: the processor number on which the timeout will happen * * This is the same as __round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long __round_jiffies_up_relative(unsigned long j, int cpu) { unsigned long j0 = jiffies; /* Use j0 because jiffies might change while we run */ return round_jiffies_common(j + j0, cpu, true) - j0; } EXPORT_SYMBOL_GPL(__round_jiffies_up_relative); /** * round_jiffies_up - function to round jiffies up to a full second * @j: the time in (absolute) jiffies that should be rounded * * This is the same as round_jiffies() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up(unsigned long j) { return round_jiffies_common(j, raw_smp_processor_id(), true); } EXPORT_SYMBOL_GPL(round_jiffies_up); /** * round_jiffies_up_relative - function to round jiffies up to a full second * @j: the time in (relative) jiffies that should be rounded * * This is the same as round_jiffies_relative() except that it will never * round down. This is useful for timeouts for which the exact time * of firing does not matter too much, as long as they don't fire too * early. */ unsigned long round_jiffies_up_relative(unsigned long j) { return __round_jiffies_up_relative(j, raw_smp_processor_id()); } EXPORT_SYMBOL_GPL(round_jiffies_up_relative); static inline unsigned int timer_get_idx(struct timer_list *timer) { return (timer->flags & TIMER_ARRAYMASK) >> TIMER_ARRAYSHIFT; } static inline void timer_set_idx(struct timer_list *timer, unsigned int idx) { timer->flags = (timer->flags & ~TIMER_ARRAYMASK) | idx << TIMER_ARRAYSHIFT; } /* * Helper function to calculate the array index for a given expiry * time. */ static inline unsigned calc_index(unsigned long expires, unsigned lvl, unsigned long *bucket_expiry) { /* * The timer wheel has to guarantee that a timer does not fire * early. Early expiry can happen due to: * - Timer is armed at the edge of a tick * - Truncation of the expiry time in the outer wheel levels * * Round up with level granularity to prevent this. */ expires = (expires + LVL_GRAN(lvl)) >> LVL_SHIFT(lvl); *bucket_expiry = expires << LVL_SHIFT(lvl); return LVL_OFFS(lvl) + (expires & LVL_MASK); } static int calc_wheel_index(unsigned long expires, unsigned long clk, unsigned long *bucket_expiry) { unsigned long delta = expires - clk; unsigned int idx; if (delta < LVL_START(1)) { idx = calc_index(expires, 0, bucket_expiry); } else if (delta < LVL_START(2)) { idx = calc_index(expires, 1, bucket_expiry); } else if (delta < LVL_START(3)) { idx = calc_index(expires, 2, bucket_expiry); } else if (delta < LVL_START(4)) { idx = calc_index(expires, 3, bucket_expiry); } else if (delta < LVL_START(5)) { idx = calc_index(expires, 4, bucket_expiry); } else if (delta < LVL_START(6)) { idx = calc_index(expires, 5, bucket_expiry); } else if (delta < LVL_START(7)) { idx = calc_index(expires, 6, bucket_expiry); } else if (LVL_DEPTH > 8 && delta < LVL_START(8)) { idx = calc_index(expires, 7, bucket_expiry); } else if ((long) delta < 0) { idx = clk & LVL_MASK; *bucket_expiry = clk; } else { /* * Force expire obscene large timeouts to expire at the * capacity limit of the wheel. */ if (delta >= WHEEL_TIMEOUT_CUTOFF) expires = clk + WHEEL_TIMEOUT_MAX; idx = calc_index(expires, LVL_DEPTH - 1, bucket_expiry); } return idx; } static void trigger_dyntick_cpu(struct timer_base *base, struct timer_list *timer) { if (!is_timers_nohz_active()) return; /* * TODO: This wants some optimizing similar to the code below, but we * will do that when we switch from push to pull for deferrable timers. */ if (timer->flags & TIMER_DEFERRABLE) { if (tick_nohz_full_cpu(base->cpu)) wake_up_nohz_cpu(base->cpu); return; } /* * We might have to IPI the remote CPU if the base is idle and the * timer is not deferrable. If the other CPU is on the way to idle * then it can't set base->is_idle as we hold the base lock: */ if (base->is_idle) wake_up_nohz_cpu(base->cpu); } /* * Enqueue the timer into the hash bucket, mark it pending in * the bitmap, store the index in the timer flags then wake up * the target CPU if needed. */ static void enqueue_timer(struct timer_base *base, struct timer_list *timer, unsigned int idx, unsigned long bucket_expiry) { hlist_add_head(&timer->entry, base->vectors + idx); __set_bit(idx, base->pending_map); timer_set_idx(timer, idx); trace_timer_start(timer, timer->expires, timer->flags); /* * Check whether this is the new first expiring timer. The * effective expiry time of the timer is required here * (bucket_expiry) instead of timer->expires. */ if (time_before(bucket_expiry, base->next_expiry)) { /* * Set the next expiry time and kick the CPU so it * can reevaluate the wheel: */ base->next_expiry = bucket_expiry; base->timers_pending = true; base->next_expiry_recalc = false; trigger_dyntick_cpu(base, timer); } } static void internal_add_timer(struct timer_base *base, struct timer_list *timer) { unsigned long bucket_expiry; unsigned int idx; idx = calc_wheel_index(timer->expires, base->clk, &bucket_expiry); enqueue_timer(base, timer, idx, bucket_expiry); } #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr timer_debug_descr; static void *timer_debug_hint(void *addr) { return ((struct timer_list *) addr)->function; } static bool timer_is_static_object(void *addr) { struct timer_list *timer = addr; return (timer->entry.pprev == NULL && timer->entry.next == TIMER_ENTRY_STATIC); } /* * fixup_init is called when: * - an active object is initialized */ static bool timer_fixup_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_init(timer, &timer_debug_descr); return true; default: return false; } } /* Stub timer callback for improperly used timers. */ static void stub_timer(struct timer_list *unused) { WARN_ON(1); } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool timer_fixup_activate(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool timer_fixup_free(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: del_timer_sync(timer); debug_object_free(timer, &timer_debug_descr); return true; default: return false; } } /* * fixup_assert_init is called when: * - an untracked/uninit-ed object is found */ static bool timer_fixup_assert_init(void *addr, enum debug_obj_state state) { struct timer_list *timer = addr; switch (state) { case ODEBUG_STATE_NOTAVAILABLE: timer_setup(timer, stub_timer, 0); return true; default: return false; } } static const struct debug_obj_descr timer_debug_descr = { .name = "timer_list", .debug_hint = timer_debug_hint, .is_static_object = timer_is_static_object, .fixup_init = timer_fixup_init, .fixup_activate = timer_fixup_activate, .fixup_free = timer_fixup_free, .fixup_assert_init = timer_fixup_assert_init, }; static inline void debug_timer_init(struct timer_list *timer) { debug_object_init(timer, &timer_debug_descr); } static inline void debug_timer_activate(struct timer_list *timer) { debug_object_activate(timer, &timer_debug_descr); } static inline void debug_timer_deactivate(struct timer_list *timer) { debug_object_deactivate(timer, &timer_debug_descr); } static inline void debug_timer_assert_init(struct timer_list *timer) { debug_object_assert_init(timer, &timer_debug_descr); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key); void init_timer_on_stack_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_object_init_on_stack(timer, &timer_debug_descr); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL_GPL(init_timer_on_stack_key); void destroy_timer_on_stack(struct timer_list *timer) { debug_object_free(timer, &timer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_timer_on_stack); #else static inline void debug_timer_init(struct timer_list *timer) { } static inline void debug_timer_activate(struct timer_list *timer) { } static inline void debug_timer_deactivate(struct timer_list *timer) { } static inline void debug_timer_assert_init(struct timer_list *timer) { } #endif static inline void debug_init(struct timer_list *timer) { debug_timer_init(timer); trace_timer_init(timer); } static inline void debug_deactivate(struct timer_list *timer) { debug_timer_deactivate(timer); trace_timer_cancel(timer); } static inline void debug_assert_init(struct timer_list *timer) { debug_timer_assert_init(timer); } static void do_init_timer(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { timer->entry.pprev = NULL; timer->function = func; if (WARN_ON_ONCE(flags & ~TIMER_INIT_FLAGS)) flags &= TIMER_INIT_FLAGS; timer->flags = flags | raw_smp_processor_id(); lockdep_init_map(&timer->lockdep_map, name, key, 0); } /** * init_timer_key - initialize a timer * @timer: the timer to be initialized * @func: timer callback function * @flags: timer flags * @name: name of the timer * @key: lockdep class key of the fake lock used for tracking timer * sync lock dependencies * * init_timer_key() must be done to a timer prior calling *any* of the * other timer functions. */ void init_timer_key(struct timer_list *timer, void (*func)(struct timer_list *), unsigned int flags, const char *name, struct lock_class_key *key) { debug_init(timer); do_init_timer(timer, func, flags, name, key); } EXPORT_SYMBOL(init_timer_key); static inline void detach_timer(struct timer_list *timer, bool clear_pending) { struct hlist_node *entry = &timer->entry; debug_deactivate(timer); __hlist_del(entry); if (clear_pending) entry->pprev = NULL; entry->next = LIST_POISON2; } static int detach_if_pending(struct timer_list *timer, struct timer_base *base, bool clear_pending) { unsigned idx = timer_get_idx(timer); if (!timer_pending(timer)) return 0; if (hlist_is_singular_node(&timer->entry, base->vectors + idx)) { __clear_bit(idx, base->pending_map); base->next_expiry_recalc = true; } detach_timer(timer, clear_pending); return 1; } static inline struct timer_base *get_timer_cpu_base(u32 tflags, u32 cpu) { struct timer_base *base = per_cpu_ptr(&timer_bases[BASE_STD], cpu); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = per_cpu_ptr(&timer_bases[BASE_DEF], cpu); return base; } static inline struct timer_base *get_timer_this_cpu_base(u32 tflags) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* * If the timer is deferrable and NO_HZ_COMMON is set then we need * to use the deferrable base. */ if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && (tflags & TIMER_DEFERRABLE)) base = this_cpu_ptr(&timer_bases[BASE_DEF]); return base; } static inline struct timer_base *get_timer_base(u32 tflags) { return get_timer_cpu_base(tflags, tflags & TIMER_CPUMASK); } static inline struct timer_base * get_target_base(struct timer_base *base, unsigned tflags) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !(tflags & TIMER_PINNED)) return get_timer_cpu_base(tflags, get_nohz_timer_target()); #endif return get_timer_this_cpu_base(tflags); } static inline void forward_timer_base(struct timer_base *base) { unsigned long jnow = READ_ONCE(jiffies); /* * No need to forward if we are close enough below jiffies. * Also while executing timers, base->clk is 1 offset ahead * of jiffies to avoid endless requeuing to current jffies. */ if ((long)(jnow - base->clk) < 1) return; /* * If the next expiry value is > jiffies, then we fast forward to * jiffies otherwise we forward to the next expiry value. */ if (time_after(base->next_expiry, jnow)) { base->clk = jnow; } else { if (WARN_ON_ONCE(time_before(base->next_expiry, base->clk))) return; base->clk = base->next_expiry; } } /* * We are using hashed locking: Holding per_cpu(timer_bases[x]).lock means * that all timers which are tied to this base are locked, and the base itself * is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found in the base->vectors array. * * When a timer is migrating then the TIMER_MIGRATING flag is set and we need * to wait until the migration is done. */ static struct timer_base *lock_timer_base(struct timer_list *timer, unsigned long *flags) __acquires(timer->base->lock) { for (;;) { struct timer_base *base; u32 tf; /* * We need to use READ_ONCE() here, otherwise the compiler * might re-read @tf between the check for TIMER_MIGRATING * and spin_lock(). */ tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { base = get_timer_base(tf); raw_spin_lock_irqsave(&base->lock, *flags); if (timer->flags == tf) return base; raw_spin_unlock_irqrestore(&base->lock, *flags); } cpu_relax(); } } #define MOD_TIMER_PENDING_ONLY 0x01 #define MOD_TIMER_REDUCE 0x02 #define MOD_TIMER_NOTPENDING 0x04 static inline int __mod_timer(struct timer_list *timer, unsigned long expires, unsigned int options) { unsigned long clk = 0, flags, bucket_expiry; struct timer_base *base, *new_base; unsigned int idx = UINT_MAX; int ret = 0; BUG_ON(!timer->function); /* * This is a common optimization triggered by the networking code - if * the timer is re-modified to have the same timeout or ends up in the * same array bucket then just return: */ if (!(options & MOD_TIMER_NOTPENDING) && timer_pending(timer)) { /* * The downside of this optimization is that it can result in * larger granularity than you would get from adding a new * timer with this expiry. */ long diff = timer->expires - expires; if (!diff) return 1; if (options & MOD_TIMER_REDUCE && diff <= 0) return 1; /* * We lock timer base and calculate the bucket index right * here. If the timer ends up in the same bucket, then we * just update the expiry time and avoid the whole * dequeue/enqueue dance. */ base = lock_timer_base(timer, &flags); forward_timer_base(base); if (timer_pending(timer) && (options & MOD_TIMER_REDUCE) && time_before_eq(timer->expires, expires)) { ret = 1; goto out_unlock; } clk = base->clk; idx = calc_wheel_index(expires, clk, &bucket_expiry); /* * Retrieve and compare the array index of the pending * timer. If it matches set the expiry to the new value so a * subsequent call will exit in the expires check above. */ if (idx == timer_get_idx(timer)) { if (!(options & MOD_TIMER_REDUCE)) timer->expires = expires; else if (time_after(timer->expires, expires)) timer->expires = expires; ret = 1; goto out_unlock; } } else { base = lock_timer_base(timer, &flags); forward_timer_base(base); } ret = detach_if_pending(timer, base, false); if (!ret && (options & MOD_TIMER_PENDING_ONLY)) goto out_unlock; new_base = get_target_base(base, timer->flags); if (base != new_base) { /* * We are trying to schedule the timer on the new base. * However we can't change timer's base while it is running, * otherwise del_timer_sync() can't detect that the timer's * handler yet has not finished. This also guarantees that the * timer is serialized wrt itself. */ if (likely(base->running_timer != timer)) { /* See the comment in lock_timer_base() */ timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | base->cpu); forward_timer_base(base); } } debug_timer_activate(timer); timer->expires = expires; /* * If 'idx' was calculated above and the base time did not advance * between calculating 'idx' and possibly switching the base, only * enqueue_timer() is required. Otherwise we need to (re)calculate * the wheel index via internal_add_timer(). */ if (idx != UINT_MAX && clk == base->clk) enqueue_timer(base, timer, idx, bucket_expiry); else internal_add_timer(base, timer); out_unlock: raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } /** * mod_timer_pending - modify a pending timer's timeout * @timer: the pending timer to be modified * @expires: new timeout in jiffies * * mod_timer_pending() is the same for pending timers as mod_timer(), * but will not re-activate and modify already deleted timers. * * It is useful for unserialized use of timers. */ int mod_timer_pending(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_PENDING_ONLY); } EXPORT_SYMBOL(mod_timer_pending); /** * mod_timer - modify a timer's timeout * @timer: the timer to be modified * @expires: new timeout in jiffies * * mod_timer() is a more efficient way to update the expire field of an * active timer (if the timer is inactive it will be activated) * * mod_timer(timer, expires) is equivalent to: * * del_timer(timer); timer->expires = expires; add_timer(timer); * * Note that if there are multiple unserialized concurrent users of the * same timer, then mod_timer() is the only safe way to modify the timeout, * since add_timer() cannot modify an already running timer. * * The function returns whether it has modified a pending timer or not. * (ie. mod_timer() of an inactive timer returns 0, mod_timer() of an * active timer returns 1.) */ int mod_timer(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, 0); } EXPORT_SYMBOL(mod_timer); /** * timer_reduce - Modify a timer's timeout if it would reduce the timeout * @timer: The timer to be modified * @expires: New timeout in jiffies * * timer_reduce() is very similar to mod_timer(), except that it will only * modify a running timer if that would reduce the expiration time (it will * start a timer that isn't running). */ int timer_reduce(struct timer_list *timer, unsigned long expires) { return __mod_timer(timer, expires, MOD_TIMER_REDUCE); } EXPORT_SYMBOL(timer_reduce); /** * add_timer - start a timer * @timer: the timer to be added * * The kernel will do a ->function(@timer) callback from the * timer interrupt at the ->expires point in the future. The * current time is 'jiffies'. * * The timer's ->expires, ->function fields must be set prior calling this * function. * * Timers with an ->expires field in the past will be executed in the next * timer tick. */ void add_timer(struct timer_list *timer) { BUG_ON(timer_pending(timer)); __mod_timer(timer, timer->expires, MOD_TIMER_NOTPENDING); } EXPORT_SYMBOL(add_timer); /** * add_timer_on - start a timer on a particular CPU * @timer: the timer to be added * @cpu: the CPU to start it on * * This is not very scalable on SMP. Double adds are not possible. */ void add_timer_on(struct timer_list *timer, int cpu) { struct timer_base *new_base, *base; unsigned long flags; BUG_ON(timer_pending(timer) || !timer->function); new_base = get_timer_cpu_base(timer->flags, cpu); /* * If @timer was on a different CPU, it should be migrated with the * old base locked to prevent other operations proceeding with the * wrong base locked. See lock_timer_base(). */ base = lock_timer_base(timer, &flags); if (base != new_base) { timer->flags |= TIMER_MIGRATING; raw_spin_unlock(&base->lock); base = new_base; raw_spin_lock(&base->lock); WRITE_ONCE(timer->flags, (timer->flags & ~TIMER_BASEMASK) | cpu); } forward_timer_base(base); debug_timer_activate(timer); internal_add_timer(base, timer); raw_spin_unlock_irqrestore(&base->lock, flags); } EXPORT_SYMBOL_GPL(add_timer_on); /** * del_timer - deactivate a timer. * @timer: the timer to be deactivated * * del_timer() deactivates a timer - this works on both active and inactive * timers. * * The function returns whether it has deactivated a pending timer or not. * (ie. del_timer() of an inactive timer returns 0, del_timer() of an * active timer returns 1.) */ int del_timer(struct timer_list *timer) { struct timer_base *base; unsigned long flags; int ret = 0; debug_assert_init(timer); if (timer_pending(timer)) { base = lock_timer_base(timer, &flags); ret = detach_if_pending(timer, base, true); raw_spin_unlock_irqrestore(&base->lock, flags); } return ret; } EXPORT_SYMBOL(del_timer); /** * try_to_del_timer_sync - Try to deactivate a timer * @timer: timer to delete * * This function tries to deactivate a timer. Upon successful (ret >= 0) * exit the timer is not queued and the handler is not running on any CPU. */ int try_to_del_timer_sync(struct timer_list *timer) { struct timer_base *base; unsigned long flags; int ret = -1; debug_assert_init(timer); base = lock_timer_base(timer, &flags); if (base->running_timer != timer) ret = detach_if_pending(timer, base, true); raw_spin_unlock_irqrestore(&base->lock, flags); return ret; } EXPORT_SYMBOL(try_to_del_timer_sync); #ifdef CONFIG_PREEMPT_RT static __init void timer_base_init_expiry_lock(struct timer_base *base) { spin_lock_init(&base->expiry_lock); } static inline void timer_base_lock_expiry(struct timer_base *base) { spin_lock(&base->expiry_lock); } static inline void timer_base_unlock_expiry(struct timer_base *base) { spin_unlock(&base->expiry_lock); } /* * The counterpart to del_timer_wait_running(). * * If there is a waiter for base->expiry_lock, then it was waiting for the * timer callback to finish. Drop expiry_lock and reaquire it. That allows * the waiter to acquire the lock and make progress. */ static void timer_sync_wait_running(struct timer_base *base) { if (atomic_read(&base->timer_waiters)) { raw_spin_unlock_irq(&base->lock); spin_unlock(&base->expiry_lock); spin_lock(&base->expiry_lock); raw_spin_lock_irq(&base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion, if the softirq thread on a remote CPU * got preempted, and it prevents a life lock when the task which tries to * delete a timer preempted the softirq thread running the timer callback * function. */ static void del_timer_wait_running(struct timer_list *timer) { u32 tf; tf = READ_ONCE(timer->flags); if (!(tf & TIMER_MIGRATING)) { struct timer_base *base = get_timer_base(tf); /* * Mark the base as contended and grab the expiry lock, * which is held by the softirq across the timer * callback. Drop the lock immediately so the softirq can * expire the next timer. In theory the timer could already * be running again, but that's more than unlikely and just * causes another wait loop. */ atomic_inc(&base->timer_waiters); spin_lock_bh(&base->expiry_lock); atomic_dec(&base->timer_waiters); spin_unlock_bh(&base->expiry_lock); } } #else static inline void timer_base_init_expiry_lock(struct timer_base *base) { } static inline void timer_base_lock_expiry(struct timer_base *base) { } static inline void timer_base_unlock_expiry(struct timer_base *base) { } static inline void timer_sync_wait_running(struct timer_base *base) { } static inline void del_timer_wait_running(struct timer_list *timer) { } #endif #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) /** * del_timer_sync - deactivate a timer and wait for the handler to finish. * @timer: the timer to be deactivated * * This function only differs from del_timer() on SMP: besides deactivating * the timer it also makes sure the handler has finished executing on other * CPUs. * * Synchronization rules: Callers must prevent restarting of the timer, * otherwise this function is meaningless. It must not be called from * interrupt contexts unless the timer is an irqsafe one. The caller must * not hold locks which would prevent completion of the timer's * handler. The timer's handler must not call add_timer_on(). Upon exit the * timer is not queued and the handler is not running on any CPU. * * Note: For !irqsafe timers, you must not hold locks that are held in * interrupt context while calling this function. Even if the lock has * nothing to do with the timer in question. Here's why:: * * CPU0 CPU1 * ---- ---- * <SOFTIRQ> * call_timer_fn(); * base->running_timer = mytimer; * spin_lock_irq(somelock); * <IRQ> * spin_lock(somelock); * del_timer_sync(mytimer); * while (base->running_timer == mytimer); * * Now del_timer_sync() will never return and never release somelock. * The interrupt on the other CPU is waiting to grab somelock but * it has interrupted the softirq that CPU0 is waiting to finish. * * The function returns whether it has deactivated a pending timer or not. */ int del_timer_sync(struct timer_list *timer) { int ret; #ifdef CONFIG_LOCKDEP unsigned long flags; /* * If lockdep gives a backtrace here, please reference * the synchronization rules above. */ local_irq_save(flags); lock_map_acquire(&timer->lockdep_map); lock_map_release(&timer->lockdep_map); local_irq_restore(flags); #endif /* * don't use it in hardirq context, because it * could lead to deadlock. */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); do { ret = try_to_del_timer_sync(timer); if (unlikely(ret < 0)) { del_timer_wait_running(timer); cpu_relax(); } } while (ret < 0); return ret; } EXPORT_SYMBOL(del_timer_sync); #endif static void call_timer_fn(struct timer_list *timer, void (*fn)(struct timer_list *), unsigned long baseclk) { int count = preempt_count(); #ifdef CONFIG_LOCKDEP /* * It is permissible to free the timer from inside the * function that is called from it, this we need to take into * account for lockdep too. To avoid bogus "held lock freed" * warnings as well as problems when looking into * timer->lockdep_map, make a copy and use that here. */ struct lockdep_map lockdep_map; lockdep_copy_map(&lockdep_map, &timer->lockdep_map); #endif /* * Couple the lock chain with the lock chain at * del_timer_sync() by acquiring the lock_map around the fn() * call here and in del_timer_sync(). */ lock_map_acquire(&lockdep_map); trace_timer_expire_entry(timer, baseclk); fn(timer); trace_timer_expire_exit(timer); lock_map_release(&lockdep_map); if (count != preempt_count()) { WARN_ONCE(1, "timer: %pS preempt leak: %08x -> %08x\n", fn, count, preempt_count()); /* * Restore the preempt count. That gives us a decent * chance to survive and extract information. If the * callback kept a lock held, bad luck, but not worse * than the BUG() we had. */ preempt_count_set(count); } } static void expire_timers(struct timer_base *base, struct hlist_head *head) { /* * This value is required only for tracing. base->clk was * incremented directly before expire_timers was called. But expiry * is related to the old base->clk value. */ unsigned long baseclk = base->clk - 1; while (!hlist_empty(head)) { struct timer_list *timer; void (*fn)(struct timer_list *); timer = hlist_entry(head->first, struct timer_list, entry); base->running_timer = timer; detach_timer(timer, true); fn = timer->function; if (timer->flags & TIMER_IRQSAFE) { raw_spin_unlock(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock(&base->lock); base->running_timer = NULL; } else { raw_spin_unlock_irq(&base->lock); call_timer_fn(timer, fn, baseclk); raw_spin_lock_irq(&base->lock); base->running_timer = NULL; timer_sync_wait_running(base); } } } static int collect_expired_timers(struct timer_base *base, struct hlist_head *heads) { unsigned long clk = base->clk = base->next_expiry; struct hlist_head *vec; int i, levels = 0; unsigned int idx; for (i = 0; i < LVL_DEPTH; i++) { idx = (clk & LVL_MASK) + i * LVL_SIZE; if (__test_and_clear_bit(idx, base->pending_map)) { vec = base->vectors + idx; hlist_move_list(vec, heads++); levels++; } /* Is it time to look at the next level? */ if (clk & LVL_CLK_MASK) break; /* Shift clock for the next level granularity */ clk >>= LVL_CLK_SHIFT; } return levels; } /* * Find the next pending bucket of a level. Search from level start (@offset) * + @clk upwards and if nothing there, search from start of the level * (@offset) up to @offset + clk. */ static int next_pending_bucket(struct timer_base *base, unsigned offset, unsigned clk) { unsigned pos, start = offset + clk; unsigned end = offset + LVL_SIZE; pos = find_next_bit(base->pending_map, end, start); if (pos < end) return pos - start; pos = find_next_bit(base->pending_map, start, offset); return pos < start ? pos + LVL_SIZE - start : -1; } /* * Search the first expiring timer in the various clock levels. Caller must * hold base->lock. */ static unsigned long __next_timer_interrupt(struct timer_base *base) { unsigned long clk, next, adj; unsigned lvl, offset = 0; next = base->clk + NEXT_TIMER_MAX_DELTA; clk = base->clk; for (lvl = 0; lvl < LVL_DEPTH; lvl++, offset += LVL_SIZE) { int pos = next_pending_bucket(base, offset, clk & LVL_MASK); unsigned long lvl_clk = clk & LVL_CLK_MASK; if (pos >= 0) { unsigned long tmp = clk + (unsigned long) pos; tmp <<= LVL_SHIFT(lvl); if (time_before(tmp, next)) next = tmp; /* * If the next expiration happens before we reach * the next level, no need to check further. */ if (pos <= ((LVL_CLK_DIV - lvl_clk) & LVL_CLK_MASK)) break; } /* * Clock for the next level. If the current level clock lower * bits are zero, we look at the next level as is. If not we * need to advance it by one because that's going to be the * next expiring bucket in that level. base->clk is the next * expiring jiffie. So in case of: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 0 * * we have to look at all levels @index 0. With * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 0 2 * * LVL0 has the next expiring bucket @index 2. The upper * levels have the next expiring bucket @index 1. * * In case that the propagation wraps the next level the same * rules apply: * * LVL5 LVL4 LVL3 LVL2 LVL1 LVL0 * 0 0 0 0 F 2 * * So after looking at LVL0 we get: * * LVL5 LVL4 LVL3 LVL2 LVL1 * 0 0 0 1 0 * * So no propagation from LVL1 to LVL2 because that happened * with the add already, but then we need to propagate further * from LVL2 to LVL3. * * So the simple check whether the lower bits of the current * level are 0 or not is sufficient for all cases. */ adj = lvl_clk ? 1 : 0; clk >>= LVL_CLK_SHIFT; clk += adj; } base->next_expiry_recalc = false; base->timers_pending = !(next == base->clk + NEXT_TIMER_MAX_DELTA); return next; } #ifdef CONFIG_NO_HZ_COMMON /* * Check, if the next hrtimer event is before the next timer wheel * event: */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { u64 nextevt = hrtimer_get_next_event(); /* * If high resolution timers are enabled * hrtimer_get_next_event() returns KTIME_MAX. */ if (expires <= nextevt) return expires; /* * If the next timer is already expired, return the tick base * time so the tick is fired immediately. */ if (nextevt <= basem) return basem; /* * Round up to the next jiffie. High resolution timers are * off, so the hrtimers are expired in the tick and we need to * make sure that this tick really expires the timer to avoid * a ping pong of the nohz stop code. * * Use DIV_ROUND_UP_ULL to prevent gcc calling __divdi3 */ return DIV_ROUND_UP_ULL(nextevt, TICK_NSEC) * TICK_NSEC; } /** * get_next_timer_interrupt - return the time (clock mono) of the next timer * @basej: base time jiffies * @basem: base time clock monotonic * * Returns the tick aligned clock monotonic time of the next pending * timer or KTIME_MAX if no timer is pending. */ u64 get_next_timer_interrupt(unsigned long basej, u64 basem) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); u64 expires = KTIME_MAX; unsigned long nextevt; /* * Pretend that there is no timer pending if the cpu is offline. * Possible pending timers will be migrated later to an active cpu. */ if (cpu_is_offline(smp_processor_id())) return expires; raw_spin_lock(&base->lock); if (base->next_expiry_recalc) base->next_expiry = __next_timer_interrupt(base); nextevt = base->next_expiry; /* * We have a fresh next event. Check whether we can forward the * base. We can only do that when @basej is past base->clk * otherwise we might rewind base->clk. */ if (time_after(basej, base->clk)) { if (time_after(nextevt, basej)) base->clk = basej; else if (time_after(nextevt, base->clk)) base->clk = nextevt; } if (time_before_eq(nextevt, basej)) { expires = basem; base->is_idle = false; } else { if (base->timers_pending) expires = basem + (u64)(nextevt - basej) * TICK_NSEC; /* * If we expect to sleep more than a tick, mark the base idle. * Also the tick is stopped so any added timer must forward * the base clk itself to keep granularity small. This idle * logic is only maintained for the BASE_STD base, deferrable * timers may still see large granularity skew (by design). */ if ((expires - basem) > TICK_NSEC) base->is_idle = true; } raw_spin_unlock(&base->lock); return cmp_next_hrtimer_event(basem, expires); } /** * timer_clear_idle - Clear the idle state of the timer base * * Called with interrupts disabled */ void timer_clear_idle(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); /* * We do this unlocked. The worst outcome is a remote enqueue sending * a pointless IPI, but taking the lock would just make the window for * sending the IPI a few instructions smaller for the cost of taking * the lock in the exit from idle path. */ base->is_idle = false; } #endif /* * Called from the timer interrupt handler to charge one tick to the current * process. user_tick is 1 if the tick is user time, 0 for system. */ void update_process_times(int user_tick) { struct task_struct *p = current; PRANDOM_ADD_NOISE(jiffies, user_tick, p, 0); /* Note: this timer irq context must be accounted for as well. */ account_process_tick(p, user_tick); run_local_timers(); rcu_sched_clock_irq(user_tick); #ifdef CONFIG_IRQ_WORK if (in_irq()) irq_work_tick(); #endif scheduler_tick(); if (IS_ENABLED(CONFIG_POSIX_TIMERS)) run_posix_cpu_timers(); } /** * __run_timers - run all expired timers (if any) on this CPU. * @base: the timer vector to be processed. */ static inline void __run_timers(struct timer_base *base) { struct hlist_head heads[LVL_DEPTH]; int levels; if (time_before(jiffies, base->next_expiry)) return; timer_base_lock_expiry(base); raw_spin_lock_irq(&base->lock); while (time_after_eq(jiffies, base->clk) && time_after_eq(jiffies, base->next_expiry)) { levels = collect_expired_timers(base, heads); /* * The only possible reason for not finding any expired * timer at this clk is that all matching timers have been * dequeued. */ WARN_ON_ONCE(!levels && !base->next_expiry_recalc); base->clk++; base->next_expiry = __next_timer_interrupt(base); while (levels--) expire_timers(base, heads + levels); } raw_spin_unlock_irq(&base->lock); timer_base_unlock_expiry(base); } /* * This function runs timers and the timer-tq in bottom half context. */ static __latent_entropy void run_timer_softirq(struct softirq_action *h) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); __run_timers(base); if (IS_ENABLED(CONFIG_NO_HZ_COMMON)) __run_timers(this_cpu_ptr(&timer_bases[BASE_DEF])); } /* * Called by the local, per-CPU timer interrupt on SMP. */ void run_local_timers(void) { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); hrtimer_run_queues(); /* Raise the softirq only if required. */ if (time_before(jiffies, base->next_expiry)) { if (!IS_ENABLED(CONFIG_NO_HZ_COMMON)) return; /* CPU is awake, so check the deferrable base. */ base++; if (time_before(jiffies, base->next_expiry)) return; } raise_softirq(TIMER_SOFTIRQ); } /* * Since schedule_timeout()'s timer is defined on the stack, it must store * the target task on the stack as well. */ struct process_timer { struct timer_list timer; struct task_struct *task; }; static void process_timeout(struct timer_list *t) { struct process_timer *timeout = from_timer(timeout, t, timer); wake_up_process(timeout->task); } /** * schedule_timeout - sleep until timeout * @timeout: timeout value in jiffies * * Make the current task sleep until @timeout jiffies have elapsed. * The function behavior depends on the current task state * (see also set_current_state() description): * * %TASK_RUNNING - the scheduler is called, but the task does not sleep * at all. That happens because sched_submit_work() does nothing for * tasks in %TASK_RUNNING state. * * %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be %TASK_RUNNING when this * routine returns. * * Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule * the CPU away without a bound on the timeout. In this case the return * value will be %MAX_SCHEDULE_TIMEOUT. * * Returns 0 when the timer has expired otherwise the remaining time in * jiffies will be returned. In all cases the return value is guaranteed * to be non-negative. */ signed long __sched schedule_timeout(signed long timeout) { struct process_timer timer; unsigned long expire; switch (timeout) { case MAX_SCHEDULE_TIMEOUT: /* * These two special cases are useful to be comfortable * in the caller. Nothing more. We could take * MAX_SCHEDULE_TIMEOUT from one of the negative value * but I' d like to return a valid offset (>=0) to allow * the caller to do everything it want with the retval. */ schedule(); goto out; default: /* * Another bit of PARANOID. Note that the retval will be * 0 since no piece of kernel is supposed to do a check * for a negative retval of schedule_timeout() (since it * should never happens anyway). You just have the printk() * that will tell you if something is gone wrong and where. */ if (timeout < 0) { printk(KERN_ERR "schedule_timeout: wrong timeout " "value %lx\n", timeout); dump_stack(); current->state = TASK_RUNNING; goto out; } } expire = timeout + jiffies; timer.task = current; timer_setup_on_stack(&timer.timer, process_timeout, 0); __mod_timer(&timer.timer, expire, MOD_TIMER_NOTPENDING); schedule(); del_singleshot_timer_sync(&timer.timer); /* Remove the timer from the object tracker */ destroy_timer_on_stack(&timer.timer); timeout = expire - jiffies; out: return timeout < 0 ? 0 : timeout; } EXPORT_SYMBOL(schedule_timeout); /* * We can use __set_current_state() here because schedule_timeout() calls * schedule() unconditionally. */ signed long __sched schedule_timeout_interruptible(signed long timeout) { __set_current_state(TASK_INTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_interruptible); signed long __sched schedule_timeout_killable(signed long timeout) { __set_current_state(TASK_KILLABLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_killable); signed long __sched schedule_timeout_uninterruptible(signed long timeout) { __set_current_state(TASK_UNINTERRUPTIBLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_uninterruptible); /* * Like schedule_timeout_uninterruptible(), except this task will not contribute * to load average. */ signed long __sched schedule_timeout_idle(signed long timeout) { __set_current_state(TASK_IDLE); return schedule_timeout(timeout); } EXPORT_SYMBOL(schedule_timeout_idle); #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct timer_base *new_base, struct hlist_head *head) { struct timer_list *timer; int cpu = new_base->cpu; while (!hlist_empty(head)) { timer = hlist_entry(head->first, struct timer_list, entry); detach_timer(timer, false); timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; internal_add_timer(new_base, timer); } } int timers_prepare_cpu(unsigned int cpu) { struct timer_base *base; int b; for (b = 0; b < NR_BASES; b++) { base = per_cpu_ptr(&timer_bases[b], cpu); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; base->timers_pending = false; base->is_idle = false; } return 0; } int timers_dead_cpu(unsigned int cpu) { struct timer_base *old_base; struct timer_base *new_base; int b, i; BUG_ON(cpu_online(cpu)); for (b = 0; b < NR_BASES; b++) { old_base = per_cpu_ptr(&timer_bases[b], cpu); new_base = get_cpu_ptr(&timer_bases[b]); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock_irq(&new_base->lock); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); /* * The current CPUs base clock might be stale. Update it * before moving the timers over. */ forward_timer_base(new_base); BUG_ON(old_base->running_timer); for (i = 0; i < WHEEL_SIZE; i++) migrate_timer_list(new_base, old_base->vectors + i); raw_spin_unlock(&old_base->lock); raw_spin_unlock_irq(&new_base->lock); put_cpu_ptr(&timer_bases); } return 0; } #endif /* CONFIG_HOTPLUG_CPU */ static void __init init_timer_cpu(int cpu) { struct timer_base *base; int i; for (i = 0; i < NR_BASES; i++) { base = per_cpu_ptr(&timer_bases[i], cpu); base->cpu = cpu; raw_spin_lock_init(&base->lock); base->clk = jiffies; base->next_expiry = base->clk + NEXT_TIMER_MAX_DELTA; timer_base_init_expiry_lock(base); } } static void __init init_timer_cpus(void) { int cpu; for_each_possible_cpu(cpu) init_timer_cpu(cpu); } void __init init_timers(void) { init_timer_cpus(); posix_cputimers_init_work(); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } /** * msleep - sleep safely even with waitqueue interruptions * @msecs: Time in milliseconds to sleep for */ void msleep(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; while (timeout) timeout = schedule_timeout_uninterruptible(timeout); } EXPORT_SYMBOL(msleep); /** * msleep_interruptible - sleep waiting for signals * @msecs: Time in milliseconds to sleep for */ unsigned long msleep_interruptible(unsigned int msecs) { unsigned long timeout = msecs_to_jiffies(msecs) + 1; while (timeout && !signal_pending(current)) timeout = schedule_timeout_interruptible(timeout); return jiffies_to_msecs(timeout); } EXPORT_SYMBOL(msleep_interruptible); /** * usleep_range - Sleep for an approximate time * @min: Minimum time in usecs to sleep * @max: Maximum time in usecs to sleep * * In non-atomic context where the exact wakeup time is flexible, use * usleep_range() instead of udelay(). The sleep improves responsiveness * by avoiding the CPU-hogging busy-wait of udelay(), and the range reduces * power usage by allowing hrtimers to take advantage of an already- * scheduled interrupt instead of scheduling a new one just for this sleep. */ void __sched usleep_range(unsigned long min, unsigned long max) { ktime_t exp = ktime_add_us(ktime_get(), min); u64 delta = (u64)(max - min) * NSEC_PER_USEC; for (;;) { __set_current_state(TASK_UNINTERRUPTIBLE); /* Do not return before the requested sleep time has elapsed */ if (!schedule_hrtimeout_range(&exp, delta, HRTIMER_MODE_ABS)) break; } } EXPORT_SYMBOL(usleep_range);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_NOTIFY_FSNOTIFY_H_ #define __FS_NOTIFY_FSNOTIFY_H_ #include <linux/list.h> #include <linux/fsnotify.h> #include <linux/srcu.h> #include <linux/types.h> #include "../mount.h" static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct inode, i_fsnotify_marks); } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct super_block, s_fsnotify_marks); } /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); /* Destroy all marks attached to an object via connector */ extern void fsnotify_destroy_marks(fsnotify_connp_t *connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); /* allocate and destroy and event holder to attach events to notification/access queues */ extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 /* SPDX-License-Identifier: GPL-2.0 */ /* * workqueue.h --- work queue handling for Linux. */ #ifndef _LINUX_WORKQUEUE_H #define _LINUX_WORKQUEUE_H #include <linux/timer.h> #include <linux/linkage.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/threads.h> #include <linux/atomic.h> #include <linux/cpumask.h> #include <linux/rcupdate.h> struct workqueue_struct; struct work_struct; typedef void (*work_func_t)(struct work_struct *work); void delayed_work_timer_fn(struct timer_list *t); /* * The first word is the work queue pointer and the flags rolled into * one */ #define work_data_bits(work) ((unsigned long *)(&(work)->data)) enum { WORK_STRUCT_PENDING_BIT = 0, /* work item is pending execution */ WORK_STRUCT_DELAYED_BIT = 1, /* work item is delayed */ WORK_STRUCT_PWQ_BIT = 2, /* data points to pwq */ WORK_STRUCT_LINKED_BIT = 3, /* next work is linked to this one */ #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC_BIT = 4, /* static initializer (debugobjects) */ WORK_STRUCT_COLOR_SHIFT = 5, /* color for workqueue flushing */ #else WORK_STRUCT_COLOR_SHIFT = 4, /* color for workqueue flushing */ #endif WORK_STRUCT_COLOR_BITS = 4, WORK_STRUCT_PENDING = 1 << WORK_STRUCT_PENDING_BIT, WORK_STRUCT_DELAYED = 1 << WORK_STRUCT_DELAYED_BIT, WORK_STRUCT_PWQ = 1 << WORK_STRUCT_PWQ_BIT, WORK_STRUCT_LINKED = 1 << WORK_STRUCT_LINKED_BIT, #ifdef CONFIG_DEBUG_OBJECTS_WORK WORK_STRUCT_STATIC = 1 << WORK_STRUCT_STATIC_BIT, #else WORK_STRUCT_STATIC = 0, #endif /* * The last color is no color used for works which don't * participate in workqueue flushing. */ WORK_NR_COLORS = (1 << WORK_STRUCT_COLOR_BITS) - 1, WORK_NO_COLOR = WORK_NR_COLORS, /* not bound to any CPU, prefer the local CPU */ WORK_CPU_UNBOUND = NR_CPUS, /* * Reserve 8 bits off of pwq pointer w/ debugobjects turned off. * This makes pwqs aligned to 256 bytes and allows 15 workqueue * flush colors. */ WORK_STRUCT_FLAG_BITS = WORK_STRUCT_COLOR_SHIFT + WORK_STRUCT_COLOR_BITS, /* data contains off-queue information when !WORK_STRUCT_PWQ */ WORK_OFFQ_FLAG_BASE = WORK_STRUCT_COLOR_SHIFT, __WORK_OFFQ_CANCELING = WORK_OFFQ_FLAG_BASE, WORK_OFFQ_CANCELING = (1 << __WORK_OFFQ_CANCELING), /* * When a work item is off queue, its high bits point to the last * pool it was on. Cap at 31 bits and use the highest number to * indicate that no pool is associated. */ WORK_OFFQ_FLAG_BITS = 1, WORK_OFFQ_POOL_SHIFT = WORK_OFFQ_FLAG_BASE + WORK_OFFQ_FLAG_BITS, WORK_OFFQ_LEFT = BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT, WORK_OFFQ_POOL_BITS = WORK_OFFQ_LEFT <= 31 ? WORK_OFFQ_LEFT : 31, WORK_OFFQ_POOL_NONE = (1LU << WORK_OFFQ_POOL_BITS) - 1, /* convenience constants */ WORK_STRUCT_FLAG_MASK = (1UL << WORK_STRUCT_FLAG_BITS) - 1, WORK_STRUCT_WQ_DATA_MASK = ~WORK_STRUCT_FLAG_MASK, WORK_STRUCT_NO_POOL = (unsigned long)WORK_OFFQ_POOL_NONE << WORK_OFFQ_POOL_SHIFT, /* bit mask for work_busy() return values */ WORK_BUSY_PENDING = 1 << 0, WORK_BUSY_RUNNING = 1 << 1, /* maximum string length for set_worker_desc() */ WORKER_DESC_LEN = 24, }; struct work_struct { atomic_long_t data; struct list_head entry; work_func_t func; #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif }; #define WORK_DATA_INIT() ATOMIC_LONG_INIT((unsigned long)WORK_STRUCT_NO_POOL) #define WORK_DATA_STATIC_INIT() \ ATOMIC_LONG_INIT((unsigned long)(WORK_STRUCT_NO_POOL | WORK_STRUCT_STATIC)) struct delayed_work { struct work_struct work; struct timer_list timer; /* target workqueue and CPU ->timer uses to queue ->work */ struct workqueue_struct *wq; int cpu; }; struct rcu_work { struct work_struct work; struct rcu_head rcu; /* target workqueue ->rcu uses to queue ->work */ struct workqueue_struct *wq; }; /** * struct workqueue_attrs - A struct for workqueue attributes. * * This can be used to change attributes of an unbound workqueue. */ struct workqueue_attrs { /** * @nice: nice level */ int nice; /** * @cpumask: allowed CPUs */ cpumask_var_t cpumask; /** * @no_numa: disable NUMA affinity * * Unlike other fields, ``no_numa`` isn't a property of a worker_pool. It * only modifies how :c:func:`apply_workqueue_attrs` select pools and thus * doesn't participate in pool hash calculations or equality comparisons. */ bool no_numa; }; static inline struct delayed_work *to_delayed_work(struct work_struct *work) { return container_of(work, struct delayed_work, work); } static inline struct rcu_work *to_rcu_work(struct work_struct *work) { return container_of(work, struct rcu_work, work); } struct execute_work { struct work_struct work; }; #ifdef CONFIG_LOCKDEP /* * NB: because we have to copy the lockdep_map, setting _key * here is required, otherwise it could get initialised to the * copy of the lockdep_map! */ #define __WORK_INIT_LOCKDEP_MAP(n, k) \ .lockdep_map = STATIC_LOCKDEP_MAP_INIT(n, k), #else #define __WORK_INIT_LOCKDEP_MAP(n, k) #endif #define __WORK_INITIALIZER(n, f) { \ .data = WORK_DATA_STATIC_INIT(), \ .entry = { &(n).entry, &(n).entry }, \ .func = (f), \ __WORK_INIT_LOCKDEP_MAP(#n, &(n)) \ } #define __DELAYED_WORK_INITIALIZER(n, f, tflags) { \ .work = __WORK_INITIALIZER((n).work, (f)), \ .timer = __TIMER_INITIALIZER(delayed_work_timer_fn,\ (tflags) | TIMER_IRQSAFE), \ } #define DECLARE_WORK(n, f) \ struct work_struct n = __WORK_INITIALIZER(n, f) #define DECLARE_DELAYED_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, 0) #define DECLARE_DEFERRABLE_WORK(n, f) \ struct delayed_work n = __DELAYED_WORK_INITIALIZER(n, f, TIMER_DEFERRABLE) #ifdef CONFIG_DEBUG_OBJECTS_WORK extern void __init_work(struct work_struct *work, int onstack); extern void destroy_work_on_stack(struct work_struct *work); extern void destroy_delayed_work_on_stack(struct delayed_work *work); static inline unsigned int work_static(struct work_struct *work) { return *work_data_bits(work) & WORK_STRUCT_STATIC; } #else static inline void __init_work(struct work_struct *work, int onstack) { } static inline void destroy_work_on_stack(struct work_struct *work) { } static inline void destroy_delayed_work_on_stack(struct delayed_work *work) { } static inline unsigned int work_static(struct work_struct *work) { return 0; } #endif /* * initialize all of a work item in one go * * NOTE! No point in using "atomic_long_set()": using a direct * assignment of the work data initializer allows the compiler * to generate better code. */ #ifdef CONFIG_LOCKDEP #define __INIT_WORK(_work, _func, _onstack) \ do { \ static struct lock_class_key __key; \ \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ lockdep_init_map(&(_work)->lockdep_map, "(work_completion)"#_work, &__key, 0); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #else #define __INIT_WORK(_work, _func, _onstack) \ do { \ __init_work((_work), _onstack); \ (_work)->data = (atomic_long_t) WORK_DATA_INIT(); \ INIT_LIST_HEAD(&(_work)->entry); \ (_work)->func = (_func); \ } while (0) #endif #define INIT_WORK(_work, _func) \ __INIT_WORK((_work), (_func), 0) #define INIT_WORK_ONSTACK(_work, _func) \ __INIT_WORK((_work), (_func), 1) #define __INIT_DELAYED_WORK(_work, _func, _tflags) \ do { \ INIT_WORK(&(_work)->work, (_func)); \ __init_timer(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define __INIT_DELAYED_WORK_ONSTACK(_work, _func, _tflags) \ do { \ INIT_WORK_ONSTACK(&(_work)->work, (_func)); \ __init_timer_on_stack(&(_work)->timer, \ delayed_work_timer_fn, \ (_tflags) | TIMER_IRQSAFE); \ } while (0) #define INIT_DELAYED_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, 0) #define INIT_DELAYED_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, 0) #define INIT_DEFERRABLE_WORK(_work, _func) \ __INIT_DELAYED_WORK(_work, _func, TIMER_DEFERRABLE) #define INIT_DEFERRABLE_WORK_ONSTACK(_work, _func) \ __INIT_DELAYED_WORK_ONSTACK(_work, _func, TIMER_DEFERRABLE) #define INIT_RCU_WORK(_work, _func) \ INIT_WORK(&(_work)->work, (_func)) #define INIT_RCU_WORK_ONSTACK(_work, _func) \ INIT_WORK_ONSTACK(&(_work)->work, (_func)) /** * work_pending - Find out whether a work item is currently pending * @work: The work item in question */ #define work_pending(work) \ test_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(work)) /** * delayed_work_pending - Find out whether a delayable work item is currently * pending * @w: The work item in question */ #define delayed_work_pending(w) \ work_pending(&(w)->work) /* * Workqueue flags and constants. For details, please refer to * Documentation/core-api/workqueue.rst. */ enum { WQ_UNBOUND = 1 << 1, /* not bound to any cpu */ WQ_FREEZABLE = 1 << 2, /* freeze during suspend */ WQ_MEM_RECLAIM = 1 << 3, /* may be used for memory reclaim */ WQ_HIGHPRI = 1 << 4, /* high priority */ WQ_CPU_INTENSIVE = 1 << 5, /* cpu intensive workqueue */ WQ_SYSFS = 1 << 6, /* visible in sysfs, see wq_sysfs_register() */ /* * Per-cpu workqueues are generally preferred because they tend to * show better performance thanks to cache locality. Per-cpu * workqueues exclude the scheduler from choosing the CPU to * execute the worker threads, which has an unfortunate side effect * of increasing power consumption. * * The scheduler considers a CPU idle if it doesn't have any task * to execute and tries to keep idle cores idle to conserve power; * however, for example, a per-cpu work item scheduled from an * interrupt handler on an idle CPU will force the scheduler to * excute the work item on that CPU breaking the idleness, which in * turn may lead to more scheduling choices which are sub-optimal * in terms of power consumption. * * Workqueues marked with WQ_POWER_EFFICIENT are per-cpu by default * but become unbound if workqueue.power_efficient kernel param is * specified. Per-cpu workqueues which are identified to * contribute significantly to power-consumption are identified and * marked with this flag and enabling the power_efficient mode * leads to noticeable power saving at the cost of small * performance disadvantage. * * http://thread.gmane.org/gmane.linux.kernel/1480396 */ WQ_POWER_EFFICIENT = 1 << 7, __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ __WQ_LEGACY = 1 << 18, /* internal: create*_workqueue() */ __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, }; /* unbound wq's aren't per-cpu, scale max_active according to #cpus */ #define WQ_UNBOUND_MAX_ACTIVE \ max_t(int, WQ_MAX_ACTIVE, num_possible_cpus() * WQ_MAX_UNBOUND_PER_CPU) /* * System-wide workqueues which are always present. * * system_wq is the one used by schedule[_delayed]_work[_on](). * Multi-CPU multi-threaded. There are users which expect relatively * short queue flush time. Don't queue works which can run for too * long. * * system_highpri_wq is similar to system_wq but for work items which * require WQ_HIGHPRI. * * system_long_wq is similar to system_wq but may host long running * works. Queue flushing might take relatively long. * * system_unbound_wq is unbound workqueue. Workers are not bound to * any specific CPU, not concurrency managed, and all queued works are * executed immediately as long as max_active limit is not reached and * resources are available. * * system_freezable_wq is equivalent to system_wq except that it's * freezable. * * *_power_efficient_wq are inclined towards saving power and converted * into WQ_UNBOUND variants if 'wq_power_efficient' is enabled; otherwise, * they are same as their non-power-efficient counterparts - e.g. * system_power_efficient_wq is identical to system_wq if * 'wq_power_efficient' is disabled. See WQ_POWER_EFFICIENT for more info. */ extern struct workqueue_struct *system_wq; extern struct workqueue_struct *system_highpri_wq; extern struct workqueue_struct *system_long_wq; extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct *system_power_efficient_wq; extern struct workqueue_struct *system_freezable_power_efficient_wq; /** * alloc_workqueue - allocate a workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags * @max_active: max in-flight work items, 0 for default * remaining args: args for @fmt * * Allocate a workqueue with the specified parameters. For detailed * information on WQ_* flags, please refer to * Documentation/core-api/workqueue.rst. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ struct workqueue_struct *alloc_workqueue(const char *fmt, unsigned int flags, int max_active, ...); /** * alloc_ordered_workqueue - allocate an ordered workqueue * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) * @args...: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are * implemented as unbound workqueues with @max_active of one. * * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ #define alloc_ordered_workqueue(fmt, flags, args...) \ alloc_workqueue(fmt, WQ_UNBOUND | __WQ_ORDERED | \ __WQ_ORDERED_EXPLICIT | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, 1, (name)) #define create_freezable_workqueue(name) \ alloc_workqueue("%s", __WQ_LEGACY | WQ_FREEZABLE | WQ_UNBOUND | \ WQ_MEM_RECLAIM, 1, (name)) #define create_singlethread_workqueue(name) \ alloc_ordered_workqueue("%s", __WQ_LEGACY | WQ_MEM_RECLAIM, name) extern void destroy_workqueue(struct workqueue_struct *wq); struct workqueue_attrs *alloc_workqueue_attrs(void); void free_workqueue_attrs(struct workqueue_attrs *attrs); int apply_workqueue_attrs(struct workqueue_struct *wq, const struct workqueue_attrs *attrs); int workqueue_set_unbound_cpumask(cpumask_var_t cpumask); extern bool queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_work_node(int node, struct workqueue_struct *wq, struct work_struct *work); extern bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *work, unsigned long delay); extern bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay); extern bool queue_rcu_work(struct workqueue_struct *wq, struct rcu_work *rwork); extern void flush_workqueue(struct workqueue_struct *wq); extern void drain_workqueue(struct workqueue_struct *wq); extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work(struct delayed_work *dwork); extern bool cancel_delayed_work_sync(struct delayed_work *dwork); extern bool flush_rcu_work(struct rcu_work *rwork); extern void workqueue_set_max_active(struct workqueue_struct *wq, int max_active); extern struct work_struct *current_work(void); extern bool current_is_workqueue_rescuer(void); extern bool workqueue_congested(int cpu, struct workqueue_struct *wq); extern unsigned int work_busy(struct work_struct *work); extern __printf(1, 2) void set_worker_desc(const char *fmt, ...); extern void print_worker_info(const char *log_lvl, struct task_struct *task); extern void show_workqueue_state(void); extern void wq_worker_comm(char *buf, size_t size, struct task_struct *task); /** * queue_work - queue work on a workqueue * @wq: workqueue to use * @work: work to queue * * Returns %false if @work was already on a queue, %true otherwise. * * We queue the work to the CPU on which it was submitted, but if the CPU dies * it can be processed by another CPU. * * Memory-ordering properties: If it returns %true, guarantees that all stores * preceding the call to queue_work() in the program order will be visible from * the CPU which will execute @work by the time such work executes, e.g., * * { x is initially 0 } * * CPU0 CPU1 * * WRITE_ONCE(x, 1); [ @work is being executed ] * r0 = queue_work(wq, work); r1 = READ_ONCE(x); * * Forbids: r0 == true && r1 == 0 */ static inline bool queue_work(struct workqueue_struct *wq, struct work_struct *work) { return queue_work_on(WORK_CPU_UNBOUND, wq, work); } /** * queue_delayed_work - queue work on a workqueue after delay * @wq: workqueue to use * @dwork: delayable work to queue * @delay: number of jiffies to wait before queueing * * Equivalent to queue_delayed_work_on() but tries to use the local CPU. */ static inline bool queue_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * mod_delayed_work - modify delay of or queue a delayed work * @wq: workqueue to use * @dwork: work to queue * @delay: number of jiffies to wait before queueing * * mod_delayed_work_on() on local CPU. */ static inline bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); } /** * schedule_work_on - put work task on a specific cpu * @cpu: cpu to put the work task on * @work: job to be done * * This puts a job on a specific cpu */ static inline bool schedule_work_on(int cpu, struct work_struct *work) { return queue_work_on(cpu, system_wq, work); } /** * schedule_work - put work task in global workqueue * @work: job to be done * * Returns %false if @work was already on the kernel-global workqueue and * %true otherwise. * * This puts a job in the kernel-global workqueue if it was not already * queued and leaves it in the same position on the kernel-global * workqueue otherwise. * * Shares the same memory-ordering properties of queue_work(), cf. the * DocBook header of queue_work(). */ static inline bool schedule_work(struct work_struct *work) { return queue_work(system_wq, work); } /** * flush_scheduled_work - ensure that any scheduled work has run to completion. * * Forces execution of the kernel-global workqueue and blocks until its * completion. * * Think twice before calling this function! It's very easy to get into * trouble if you don't take great care. Either of the following situations * will lead to deadlock: * * One of the work items currently on the workqueue needs to acquire * a lock held by your code or its caller. * * Your code is running in the context of a work routine. * * They will be detected by lockdep when they occur, but the first might not * occur very often. It depends on what work items are on the workqueue and * what locks they need, which you have no control over. * * In most situations flushing the entire workqueue is overkill; you merely * need to know that a particular work item isn't queued and isn't running. * In such cases you should use cancel_delayed_work_sync() or * cancel_work_sync() instead. */ static inline void flush_scheduled_work(void) { flush_workqueue(system_wq); } /** * schedule_delayed_work_on - queue work in global workqueue on CPU after delay * @cpu: cpu to use * @dwork: job to be done * @delay: number of jiffies to wait * * After waiting for a given time this puts a job in the kernel-global * workqueue on the specified CPU. */ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work_on(cpu, system_wq, dwork, delay); } /** * schedule_delayed_work - put work task in global workqueue after delay * @dwork: job to be done * @delay: number of jiffies to wait or 0 for immediate execution * * After waiting for a given time this puts a job in the kernel-global * workqueue. */ static inline bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { return queue_delayed_work(system_wq, dwork, delay); } #ifndef CONFIG_SMP static inline long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } static inline long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg) { return fn(arg); } #else long work_on_cpu(int cpu, long (*fn)(void *), void *arg); long work_on_cpu_safe(int cpu, long (*fn)(void *), void *arg); #endif /* CONFIG_SMP */ #ifdef CONFIG_FREEZER extern void freeze_workqueues_begin(void); extern bool freeze_workqueues_busy(void); extern void thaw_workqueues(void); #endif /* CONFIG_FREEZER */ #ifdef CONFIG_SYSFS int workqueue_sysfs_register(struct workqueue_struct *wq); #else /* CONFIG_SYSFS */ static inline int workqueue_sysfs_register(struct workqueue_struct *wq) { return 0; } #endif /* CONFIG_SYSFS */ #ifdef CONFIG_WQ_WATCHDOG void wq_watchdog_touch(int cpu); #else /* CONFIG_WQ_WATCHDOG */ static inline void wq_watchdog_touch(int cpu) { } #endif /* CONFIG_WQ_WATCHDOG */ #ifdef CONFIG_SMP int workqueue_prepare_cpu(unsigned int cpu); int workqueue_online_cpu(unsigned int cpu); int workqueue_offline_cpu(unsigned int cpu); #endif void __init workqueue_init_early(void); void __init workqueue_init(void); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _NET_ETHTOOL_NETLINK_H #define _NET_ETHTOOL_NETLINK_H #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/sock.h> struct ethnl_req_info; int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *nest, struct net *net, struct netlink_ext_ack *extack, bool require_dev); int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype); struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); /** * ethnl_strz_size() - calculate attribute length for fixed size string * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Return: total length of an attribute with null terminated string from @s */ static inline int ethnl_strz_size(const char *s) { return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); } /** * ethnl_put_strz() - put string attribute with fixed size string * @skb: skb with the message * @attrype: attribute type * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Puts an attribute with null terminated string from @s into the message. * * Return: 0 on success, negative error code on failure */ static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, const char *s) { unsigned int len = strnlen(s, ETH_GSTRING_LEN); struct nlattr *attr; attr = nla_reserve(skb, attrtype, len + 1); if (!attr) return -EMSGSIZE; memcpy(nla_data(attr), s, len); ((char *)nla_data(attr))[len] = '\0'; return 0; } /** * ethnl_update_u32() - update u32 value from NLA_U32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u32 value from NLA_U32 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, bool *mod) { u32 val; if (!attr) return; val = nla_get_u32(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_u8() - update u8 value from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u8 value from NLA_U8 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = nla_get_u8(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_binary() - update binary data from NLA_BINARY atribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block * of length @len at @dst by attribute payload; do nothing if @attr is null. * Bool pointed to by @mod is set to true if this function changed the logical * value of *dst, otherwise it is left as is. */ static inline void ethnl_update_binary(void *dst, unsigned int len, const struct nlattr *attr, bool *mod) { if (!attr) return; if (nla_len(attr) < len) len = nla_len(attr); if (!memcmp(dst, nla_data(attr), len)) return; memcpy(dst, nla_data(attr), len); *mod = true; } /** * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Update bits in u32 value which are set in attribute's mask to values from * attribute's value. Do nothing if @attr is null or the value wouldn't change; * otherwise, set bool pointed to by @mod to true. */ static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, bool *mod) { struct nla_bitfield32 change; u32 newval; if (!attr) return; change = nla_get_bitfield32(attr); newval = (*dst & ~change.selector) | (change.value & change.selector); if (*dst == newval) return; *dst = newval; *mod = true; } /** * ethnl_reply_header_size() - total size of reply header * * This is an upper estimate so that we do not need to hold RTNL lock longer * than necessary (to prevent rename between size estimate and composing the * message). Accounts only for device ifindex and name as those are the only * attributes ethnl_fill_reply_header() puts into the reply header. */ static inline unsigned int ethnl_reply_header_size(void) { return nla_total_size(nla_total_size(sizeof(u32)) + nla_total_size(IFNAMSIZ)); } /* GET request handling */ /* Unified processing of GET requests uses two data structures: request info * and reply data. Request info holds information parsed from client request * and its stays constant through all request processing. Reply data holds data * retrieved from ethtool_ops callbacks or other internal sources which is used * to compose the reply. When processing a dump request, request info is filled * only once (when the request message is parsed) but reply data is filled for * each reply message. * * Both structures consist of part common for all request types (struct * ethnl_req_info and struct ethnl_reply_data defined below) and optional * parts specific for each request type. Common part always starts at offset 0. */ /** * struct ethnl_req_info - base type of request information for GET requests * @dev: network device the request is for (may be null) * @flags: request flags common for all request types * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at * zero offset. */ struct ethnl_req_info { struct net_device *dev; u32 flags; }; /** * struct ethnl_reply_data - base type of reply data for GET requests * @dev: device for current reply message; in single shot requests it is * equal to &ethnl_req_info.dev; in dumps it's different for each * reply message * * This is a common base for request specific structures holding data for * kernel reply message. These always embed struct ethnl_reply_data at zero * offset. */ struct ethnl_reply_data { struct net_device *dev; }; static inline int ethnl_ops_begin(struct net_device *dev) { if (dev && dev->reg_state == NETREG_UNREGISTERING) return -ENODEV; if (dev && dev->ethtool_ops->begin) return dev->ethtool_ops->begin(dev); else return 0; } static inline void ethnl_ops_complete(struct net_device *dev) { if (dev && dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); } /** * struct ethnl_request_ops - unified handling of GET requests * @request_cmd: command id for request (GET) * @reply_cmd: command id for reply (GET_REPLY) * @hdr_attr: attribute type for request header * @req_info_size: size of request info * @reply_data_size: size of reply data * @allow_nodev_do: allow non-dump request with no device identification * @parse_request: * Parse request except common header (struct ethnl_req_info). Common * header is already filled on entry, the rest up to @repdata_offset * is zero initialized. This callback should only modify type specific * request info by parsed attributes from request message. * @prepare_data: * Retrieve and prepare data needed to compose a reply message. Calls to * ethtool_ops handlers are limited to this callback. Common reply data * (struct ethnl_reply_data) is filled on entry, type specific part after * it is zero initialized. This callback should only modify the type * specific part of reply data. Device identification from struct * ethnl_reply_data is to be used as for dump requests, it iterates * through network devices while dev member of struct ethnl_req_info * points to the device from client request. * @reply_size: * Estimate reply message size. Returned value must be sufficient for * message payload without common reply header. The callback may returned * estimate higher than actual message size if exact calculation would * not be worth the saved memory space. * @fill_reply: * Fill reply message payload (except for common header) from reply data. * The callback must not generate more payload than previously called * ->reply_size() estimated. * @cleanup_data: * Optional cleanup called when reply data is no longer needed. Can be * used e.g. to free any additional data structures outside the main * structure which were allocated by ->prepare_data(). When processing * dump requests, ->cleanup() is called for each message. * * Description of variable parts of GET request handling when using the * unified infrastructure. When used, a pointer to an instance of this * structure is to be added to &ethnl_default_requests array and generic * handlers ethnl_default_doit(), ethnl_default_dumpit(), * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; * ethnl_default_notify() can be used in @ethnl_notify_handlers to send * notifications of the corresponding type. */ struct ethnl_request_ops { u8 request_cmd; u8 reply_cmd; u16 hdr_attr; unsigned int req_info_size; unsigned int reply_data_size; bool allow_nodev_do; int (*parse_request)(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack); int (*prepare_data)(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, struct genl_info *info); int (*reply_size)(const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); int (*fill_reply)(struct sk_buff *skb, const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); void (*cleanup_data)(struct ethnl_reply_data *reply_data); }; /* request handlers */ extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; extern const struct ethnl_request_ops ethnl_rings_request_ops; extern const struct ethnl_request_ops ethnl_channels_request_ops; extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1]; extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG + 1]; extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1]; extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1]; extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1]; extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1]; extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1]; extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1]; extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1]; extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info); int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _NET_ETHTOOL_NETLINK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #ifndef __BLUETOOTH_H #define __BLUETOOTH_H #include <linux/poll.h> #include <net/sock.h> #include <linux/seq_file.h> #define BT_SUBSYS_VERSION 2 #define BT_SUBSYS_REVISION 22 #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 #define PF_BLUETOOTH AF_BLUETOOTH #endif /* Bluetooth versions */ #define BLUETOOTH_VER_1_1 1 #define BLUETOOTH_VER_1_2 2 #define BLUETOOTH_VER_2_0 3 #define BLUETOOTH_VER_2_1 4 #define BLUETOOTH_VER_4_0 6 /* Reserv for core and drivers use */ #define BT_SKB_RESERVE 8 #define BTPROTO_L2CAP 0 #define BTPROTO_HCI 1 #define BTPROTO_SCO 2 #define BTPROTO_RFCOMM 3 #define BTPROTO_BNEP 4 #define BTPROTO_CMTP 5 #define BTPROTO_HIDP 6 #define BTPROTO_AVDTP 7 #define SOL_HCI 0 #define SOL_L2CAP 6 #define SOL_SCO 17 #define SOL_RFCOMM 18 #define BT_SECURITY 4 struct bt_security { __u8 level; __u8 key_size; }; #define BT_SECURITY_SDP 0 #define BT_SECURITY_LOW 1 #define BT_SECURITY_MEDIUM 2 #define BT_SECURITY_HIGH 3 #define BT_SECURITY_FIPS 4 #define BT_DEFER_SETUP 7 #define BT_FLUSHABLE 8 #define BT_FLUSHABLE_OFF 0 #define BT_FLUSHABLE_ON 1 #define BT_POWER 9 struct bt_power { __u8 force_active; }; #define BT_POWER_FORCE_ACTIVE_OFF 0 #define BT_POWER_FORCE_ACTIVE_ON 1 #define BT_CHANNEL_POLICY 10 /* BR/EDR only (default policy) * AMP controllers cannot be used. * Channel move requests from the remote device are denied. * If the L2CAP channel is currently using AMP, move the channel to BR/EDR. */ #define BT_CHANNEL_POLICY_BREDR_ONLY 0 /* BR/EDR Preferred * Allow use of AMP controllers. * If the L2CAP channel is currently on AMP, move it to BR/EDR. * Channel move requests from the remote device are allowed. */ #define BT_CHANNEL_POLICY_BREDR_PREFERRED 1 /* AMP Preferred * Allow use of AMP controllers * If the L2CAP channel is currently on BR/EDR and AMP controller * resources are available, initiate a channel move to AMP. * Channel move requests from the remote device are allowed. * If the L2CAP socket has not been connected yet, try to create * and configure the channel directly on an AMP controller rather * than BR/EDR. */ #define BT_CHANNEL_POLICY_AMP_PREFERRED 2 #define BT_VOICE 11 struct bt_voice { __u16 setting; }; #define BT_VOICE_TRANSPARENT 0x0003 #define BT_VOICE_CVSD_16BIT 0x0060 #define BT_SNDMTU 12 #define BT_RCVMTU 13 #define BT_PHY 14 #define BT_PHY_BR_1M_1SLOT 0x00000001 #define BT_PHY_BR_1M_3SLOT 0x00000002 #define BT_PHY_BR_1M_5SLOT 0x00000004 #define BT_PHY_EDR_2M_1SLOT 0x00000008 #define BT_PHY_EDR_2M_3SLOT 0x00000010 #define BT_PHY_EDR_2M_5SLOT 0x00000020 #define BT_PHY_EDR_3M_1SLOT 0x00000040 #define BT_PHY_EDR_3M_3SLOT 0x00000080 #define BT_PHY_EDR_3M_5SLOT 0x00000100 #define BT_PHY_LE_1M_TX 0x00000200 #define BT_PHY_LE_1M_RX 0x00000400 #define BT_PHY_LE_2M_TX 0x00000800 #define BT_PHY_LE_2M_RX 0x00001000 #define BT_PHY_LE_CODED_TX 0x00002000 #define BT_PHY_LE_CODED_RX 0x00004000 #define BT_MODE 15 #define BT_MODE_BASIC 0x00 #define BT_MODE_ERTM 0x01 #define BT_MODE_STREAMING 0x02 #define BT_MODE_LE_FLOWCTL 0x03 #define BT_MODE_EXT_FLOWCTL 0x04 #define BT_PKT_STATUS 16 #define BT_SCM_PKT_STATUS 0x03 __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) void bt_warn(const char *fmt, ...); __printf(1, 2) void bt_err(const char *fmt, ...); #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) void bt_dbg_set(bool enable); bool bt_dbg_get(void); __printf(1, 2) void bt_dbg(const char *fmt, ...); #endif __printf(1, 2) void bt_warn_ratelimited(const char *fmt, ...); __printf(1, 2) void bt_err_ratelimited(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) #define BT_WARN(fmt, ...) bt_warn(fmt "\n", ##__VA_ARGS__) #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) #define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) #else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif #define bt_dev_info(hdev, fmt, ...) \ BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err(hdev, fmt, ...) \ BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_dbg(hdev, fmt, ...) \ BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) /* Connection and socket states */ enum { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ BT_OPEN, BT_BOUND, BT_LISTEN, BT_CONNECT, BT_CONNECT2, BT_CONFIG, BT_DISCONN, BT_CLOSED }; /* If unused will be removed by compiler */ static inline const char *state_to_string(int state) { switch (state) { case BT_CONNECTED: return "BT_CONNECTED"; case BT_OPEN: return "BT_OPEN"; case BT_BOUND: return "BT_BOUND"; case BT_LISTEN: return "BT_LISTEN"; case BT_CONNECT: return "BT_CONNECT"; case BT_CONNECT2: return "BT_CONNECT2"; case BT_CONFIG: return "BT_CONFIG"; case BT_DISCONN: return "BT_DISCONN"; case BT_CLOSED: return "BT_CLOSED"; } return "invalid state"; } /* BD Address */ typedef struct { __u8 b[6]; } __packed bdaddr_t; /* BD Address type */ #define BDADDR_BREDR 0x00 #define BDADDR_LE_PUBLIC 0x01 #define BDADDR_LE_RANDOM 0x02 static inline bool bdaddr_type_is_valid(u8 type) { switch (type) { case BDADDR_BREDR: case BDADDR_LE_PUBLIC: case BDADDR_LE_RANDOM: return true; } return false; } static inline bool bdaddr_type_is_le(u8 type) { switch (type) { case BDADDR_LE_PUBLIC: case BDADDR_LE_RANDOM: return true; } return false; } #define BDADDR_ANY (&(bdaddr_t) {{0, 0, 0, 0, 0, 0}}) #define BDADDR_NONE (&(bdaddr_t) {{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}) /* Copy, swap, convert BD Address */ static inline int bacmp(const bdaddr_t *ba1, const bdaddr_t *ba2) { return memcmp(ba1, ba2, sizeof(bdaddr_t)); } static inline void bacpy(bdaddr_t *dst, const bdaddr_t *src) { memcpy(dst, src, sizeof(bdaddr_t)); } void baswap(bdaddr_t *dst, const bdaddr_t *src); /* Common socket structures and functions */ #define bt_sk(__sk) ((struct bt_sock *) __sk) struct bt_sock { struct sock sk; struct list_head accept_q; struct sock *parent; unsigned long flags; void (*skb_msg_name)(struct sk_buff *, void *, int *); void (*skb_put_cmsg)(struct sk_buff *, struct msghdr *, struct sock *); }; enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, }; struct bt_sock_list { struct hlist_head head; rwlock_t lock; #ifdef CONFIG_PROC_FS int (* custom_seq_show)(struct seq_file *, void *); #endif }; int bt_sock_register(int proto, const struct net_proto_family *ops); void bt_sock_unregister(int proto); void bt_sock_link(struct bt_sock_list *l, struct sock *s); void bt_sock_unlink(struct bt_sock_list *l, struct sock *s); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); int bt_sock_wait_ready(struct sock *sk, unsigned long flags); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh); void bt_accept_unlink(struct sock *sk); struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock); /* Skb helpers */ struct l2cap_ctrl { u8 sframe:1, poll:1, final:1, fcs:1, sar:2, super:2; u16 reqseq; u16 txseq; u8 retries; __le16 psm; bdaddr_t bdaddr; struct l2cap_chan *chan; }; struct sco_ctrl { u8 pkt_status; }; struct hci_dev; typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb); #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) struct hci_ctrl { u16 opcode; u8 req_flags; u8 req_event; union { hci_req_complete_t req_complete; hci_req_complete_skb_t req_complete_skb; }; }; struct bt_skb_cb { u8 pkt_type; u8 force_active; u16 expect; u8 incoming:1; union { struct l2cap_ctrl l2cap; struct sco_ctrl sco; struct hci_ctrl hci; }; }; #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how) { struct sk_buff *skb; skb = alloc_skb(len + BT_SKB_RESERVE, how); if (skb) skb_reserve(skb, BT_SKB_RESERVE); return skb; } static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk, unsigned long len, int nb, int *err) { struct sk_buff *skb; skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err); if (skb) skb_reserve(skb, BT_SKB_RESERVE); if (!skb && *err) return NULL; *err = sock_error(sk); if (*err) goto out; if (sk->sk_shutdown) { *err = -ECONNRESET; goto out; } return skb; out: kfree_skb(skb); return NULL; } int bt_to_errno(u16 code); void hci_sock_set_flag(struct sock *sk, int nr); void hci_sock_clear_flag(struct sock *sk, int nr); int hci_sock_test_flag(struct sock *sk, int nr); unsigned short hci_sock_get_channel(struct sock *sk); u32 hci_sock_get_cookie(struct sock *sk); int hci_sock_init(void); void hci_sock_cleanup(void); int bt_sysfs_init(void); void bt_sysfs_cleanup(void); int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, int (*seq_show)(struct seq_file *, void *)); void bt_procfs_cleanup(struct net *net, const char *name); extern struct dentry *bt_debugfs; int l2cap_init(void); void l2cap_exit(void); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_init(void); void sco_exit(void); #else static inline int sco_init(void) { return 0; } static inline void sco_exit(void) { } #endif int mgmt_init(void); void mgmt_exit(void); void bt_sock_reclassify_lock(struct sock *sk, int proto); #endif /* __BLUETOOTH_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1994, Karl Keyte: Added support for disk statistics * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> * - July2000 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 */ /* * This handles all read/write requests to block devices */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-pm.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> #include <linux/list_sort.h> #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> #include <linux/psi.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-rq-qos.h" struct dentry *blk_debugfs_root; EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set * @q: request queue */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); /** * blk_queue_flag_clear - atomically clear a queue flag * @flag: flag to be cleared * @q: request queue */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); /** * blk_queue_flag_test_and_set - atomically test and set a queue flag * @flag: flag to be set * @q: request queue * * Returns the previous value of @flag - 0 if the flag was not set and 1 if * the flag was already set. */ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) { return test_and_set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), REQ_OP_NAME(WRITE), REQ_OP_NAME(FLUSH), REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), REQ_OP_NAME(SCSI_OUT), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; #undef REQ_OP_NAME /** * blk_op_str - Return string XXX in the REQ_OP_XXX. * @op: REQ_OP_XXX. * * Description: Centralize block layer function to convert REQ_OP_XXX into * string format. Useful in the debugging and tracing bio or request. For * invalid REQ_OP_XXX it returns string "UNKNOWN". */ inline const char *blk_op_str(unsigned int op) { const char *op_str = "UNKNOWN"; if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) op_str = blk_op_name[op]; return op_str; } EXPORT_SYMBOL_GPL(blk_op_str); static const struct { int errno; const char *name; } blk_errors[] = { [BLK_STS_OK] = { 0, "" }, [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, /* zone device specific errors */ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; blk_status_t errno_to_blk_status(int errno) { int i; for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i; } return BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(errno_to_blk_status); int blk_status_to_errno(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return -EIO; return blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); static void print_req_error(struct request *req, blk_status_t status, const char *caller) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return; printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", caller, blk_errors[idx].name, req->rq_disk ? req->rq_disk->disk_name : "?", blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, req->nr_phys_segments, IOPRIO_PRIO_CLASS(req->ioprio)); } static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { if (error) bio->bi_status = error; if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. */ if (bio->bi_iter.bi_size) bio->bi_status = BLK_STS_IOERR; else bio->bi_iter.bi_sector = rq->__sector; } /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->rq_disk ? rq->rq_disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); /** * blk_set_pm_only - increment pm_only counter * @q: request queue pointer */ void blk_set_pm_only(struct request_queue *q) { atomic_inc(&q->pm_only); } EXPORT_SYMBOL_GPL(blk_set_pm_only); void blk_clear_pm_only(struct request_queue *q) { int pm_only; pm_only = atomic_dec_return(&q->pm_only); WARN_ON_ONCE(pm_only < 0); if (pm_only == 0) wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_clear_pm_only); /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * * Decrements the refcount of the request_queue kobject. When this reaches 0 * we'll have blk_release_queue() called. * * Context: Any context, but the last reference must not be dropped from * atomic context. */ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } EXPORT_SYMBOL(blk_put_queue); void blk_set_queue_dying(struct request_queue *q) { blk_queue_flag_set(QUEUE_FLAG_DYING, q); /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ blk_freeze_queue_start(q); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. * * Context: can sleep */ void blk_cleanup_queue(struct request_queue *q) { /* cannot be called from atomic context */ might_sleep(); WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ blk_set_queue_dying(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); /* * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that blk_mq_run_hw_queues() accesses the hardware queues * after draining finished. */ blk_freeze_queue(q); rq_qos_exit(q); blk_queue_flag_set(QUEUE_FLAG_DEAD, q); /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); if (queue_is_mq(q)) blk_mq_exit_queue(q); /* * In theory, request pool of sched_tags belongs to request queue. * However, the current implementation requires tag_set for freeing * requests, so free the pool now. * * Queue has become frozen, there can't be any in-queue requests, so * it is safe to free requests now. */ mutex_lock(&q->sysfs_lock); if (q->elevator) blk_mq_sched_free_requests(q); mutex_unlock(&q->sysfs_lock); percpu_ref_exit(&q->q_usage_counter); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; while (true) { bool success = false; rcu_read_lock(); if (percpu_ref_tryget_live(&q->q_usage_counter)) { /* * The code that increments the pm_only counter is * responsible for ensuring that that counter is * globally visible before the queue is unfrozen. */ if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || !blk_queue_pm_only(q)) { success = true; } else { percpu_ref_put(&q->q_usage_counter); } } rcu_read_unlock(); if (success) return 0; if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* * read pair of barrier in blk_freeze_queue_start(), * we need to order reading __PERCPU_REF_DEAD flag of * .q_usage_counter and reading .mq_freeze_depth or * queue dying flag, otherwise the following wait may * never return if the two reads are reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } } static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; bool nowait = bio->bi_opf & REQ_NOWAIT; int ret; ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); if (unlikely(ret)) { if (nowait && !blk_queue_dying(q)) bio_wouldblock_error(bio); else bio_io_error(bio); } return ret; } void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } static void blk_rq_timed_out_timer(struct timer_list *t) { struct request_queue *q = from_timer(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } static void blk_timeout_work(struct work_struct *work) { } struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) goto fail_q; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (ret) goto fail_id; q->backing_dev_info = bdi_alloc(node_id); if (!q->backing_dev_info) goto fail_split; q->stats = blk_alloc_queue_stats(); if (!q->stats) goto fail_stats; q->node = node_id; atomic_set(&q->nr_active_requests_shared_sbitmap, 0); timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif kobject_init(&q->kobj, &blk_queue_ktype); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; if (blkcg_init_queue(q)) goto fail_ref; blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_MAX_RQ; return q; fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: blk_free_queue_stats(q->stats); fail_stats: bdi_put(q->backing_dev_info); fail_split: bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } EXPORT_SYMBOL(blk_alloc_queue); /** * blk_get_queue - increment the request_queue refcount * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. * * Context: Any context. */ bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { __blk_get_queue(q); return true; } return false; } EXPORT_SYMBOL(blk_get_queue); /** * blk_get_request - allocate a request * @q: request queue to allocate a request for * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); return req; } EXPORT_SYMBOL(blk_get_request); void blk_put_request(struct request *req) { blk_mq_free_request(req); } EXPORT_SYMBOL(blk_put_request); static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; pr_info_ratelimited("attempt to access beyond end of device\n" "%s: rw=%d, want=%llu, limit=%llu\n", bio_devname(bio, b), bio->bi_opf, bio_end_sector(bio), maxsector); } #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return part->make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { const int op = bio_op(bio); if (part->policy && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return false; WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", bio_devname(bio, b), part->partno); /* Older lvm-tools actually trigger this */ return false; } return false; } static noinline int should_fail_bio(struct bio *bio) { if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) return -EIO; return 0; } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* * Check whether this bio extends beyond the end of the device or partition. * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) { unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && maxsector && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { handle_bad_sector(bio, maxsector); return -EIO; } return 0; } /* * Remap block n of partition p to block n+start(p) of the disk. */ static inline int blk_partition_remap(struct bio *bio) { struct hd_struct *p; int ret = -EIO; rcu_read_lock(); p = __disk_get_part(bio->bi_disk, bio->bi_partno); if (unlikely(!p)) goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) goto out; if (unlikely(bio_check_ro(bio, p))) goto out; if (bio_sectors(bio)) { if (bio_check_eod(bio, part_nr_sects_read(p))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), bio->bi_iter.bi_sector - p->start_sect); } bio->bi_partno = 0; ret = 0; out: rcu_read_unlock(); return ret; } /* * Check write append to a zoned block device. */ static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { sector_t pos = bio->bi_iter.bi_sector; int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ if (!blk_queue_is_zoned(q)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ if (pos & (blk_queue_zone_sectors(q) - 1) || !blk_queue_zone_is_seq(q, pos)) return BLK_STS_IOERR; /* * Not allowed to cross zone boundaries. Otherwise, the BIO will be * split and could result in non-contiguous sectors being written in * different zones. */ if (nr_sectors > q->limits.chunk_sectors) return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ if (nr_sectors > q->limits.max_zone_append_sectors) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; return BLK_STS_OK; } static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; might_sleep(); plug = blk_mq_plug(q, bio); if (plug && plug->nowait) bio->bi_opf |= REQ_NOWAIT; /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) goto not_supported; if (should_fail_bio(bio)) goto end_io; if (bio->bi_partno) { if (unlikely(blk_partition_remap(bio))) goto end_io; } else { if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; } /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } } if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) bio->bi_opf &= ~REQ_HIPRI; switch (bio_op(bio)) { case REQ_OP_DISCARD: if (!blk_queue_discard(q)) goto not_supported; break; case REQ_OP_SECURE_ERASE: if (!blk_queue_secure_erase(q)) goto not_supported; break; case REQ_OP_WRITE_SAME: if (!q->limits.max_write_same_sectors) goto not_supported; break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) goto end_io; break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: if (!blk_queue_is_zoned(q)) goto not_supported; break; case REQ_OP_ZONE_RESET_ALL: if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) goto not_supported; break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; default: break; } /* * Various block parts want %current->io_context, so allocate it up * front rather than dealing with lots of pain to allocate it only * where needed. This may fail and the block layer knows how to live * with it. */ if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); if (blk_throtl_bio(bio)) return false; blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(q, bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } return true; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); return false; } static blk_qc_t __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_disk; blk_qc_t ret = BLK_QC_T_NONE; if (blk_crypto_bio_prep(&bio)) { if (!disk->fops->submit_bio) return blk_mq_submit_bio(bio); ret = disk->fops->submit_bio(bio); } blk_queue_exit(disk->queue); return ret; } /* * The loop in this function may be a bit non-obvious, and so deserves some * explanation: * * - Before entering the loop, bio->bi_next is NULL (as all callers ensure * that), so we have a list with a single bio. * - We pretend that we have just taken it off a longer list, so we assign * bio_list to a pointer to the bio_list_on_stack, thus initialising the * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. * - In this case we really did just take the bio of the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio_bio, but that haven't been processed yet. */ static blk_qc_t __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; struct bio_list lower, same; if (unlikely(bio_queue_enter(bio) != 0)) continue; /* * Create a fresh bio_list for all subordinate requests. */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); ret = __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the * same level. */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bio->bi_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* * Now assemble so we handle the lowest level first. */ bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; return ret; } static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; blk_qc_t ret = BLK_QC_T_NONE; current->bio_list = bio_list; do { struct gendisk *disk = bio->bi_disk; if (unlikely(bio_queue_enter(bio) != 0)) continue; if (!blk_crypto_bio_prep(&bio)) { blk_queue_exit(disk->queue); ret = BLK_QC_T_NONE; continue; } ret = blk_mq_submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; return ret; } /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * * This is a version of submit_bio() that shall only be used for I/O that is * resubmitted to lower level drivers by stacking block drivers. All file * systems and other upper level users of the block layer should use * submit_bio() instead. */ blk_qc_t submit_bio_noacct(struct bio *bio) { if (!submit_bio_checks(bio)) return BLK_QC_T_NONE; /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. */ if (current->bio_list) { bio_list_add(&current->bio_list[0], bio); return BLK_QC_T_NONE; } if (!bio->bi_disk->fops->submit_bio) return __submit_bio_noacct_mq(bio); return __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The * bio will be send to the device described by the bi_disk and bi_partno fields. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has * been called. */ blk_qc_t submit_bio(struct bio *bio) { if (blkcg_punt_bio_submit(bio)) return BLK_QC_T_NONE; /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ if (bio_has_data(bio)) { unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) count = queue_logical_block_size(bio->bi_disk->queue) >> 9; else count = bio_sectors(bio); if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); } else { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), op_is_write(bio_op(bio)) ? "WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, bio_devname(bio, b), count); } } /* * If we're reading data that is part of the userspace workingset, count * submission time as memory stall. When the device is congested, or * the submitting cgroup IO-throttled, submission can be a significant * part of overall IO time. */ if (unlikely(bio_op(bio) == REQ_OP_READ && bio_flagged(bio, BIO_WORKINGSET))) { unsigned long pflags; blk_qc_t ret; psi_memstall_enter(&pflags); ret = submit_bio_noacct(bio); psi_memstall_leave(&pflags); return ret; } return submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for the new queue limits * @q: the queue * @rq: the request being checked * * Description: * @rq may have been made based on weaker limitations of upper-level queues * in request stacking drivers, and it may violate the limitation of @q. * Since the block layer and the underlying device driver trust @rq * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * * Request stacking drivers like request-based dm may change the queue * limits when retrying requests on other queues. Those requests need * to be checked against the new queue limits again during dispatch. */ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * queue's settings related to segment counting like q->bounce_pfn * may differ from that of other stacking queues. * Recalculate it to check the request correctly on this queue's * limitation. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", __func__, rq->nr_phys_segments, queue_max_segments(q)); return BLK_STS_IOERR; } return BLK_STS_OK; } /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @q: the queue to submit the request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { blk_status_t ret; ret = blk_cloned_rq_check_limits(q, rq); if (ret != BLK_STS_OK) return ret; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) return BLK_STS_IOERR; if (blk_queue_io_stat(q)) blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ return blk_mq_request_issue_directly(rq, true); } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_err_bytes - determine number of bytes till the next failure boundary * @rq: request to examine * * Description: * A request could be merge of IOs which require different failure * handling. This function determines the number of bytes which * can be failed from the beginning of the request without * crossing into area which need to be retried further. * * Return: * The number of bytes to fail. */ unsigned int blk_rq_err_bytes(const struct request *rq) { unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; unsigned int bytes = 0; struct bio *bio; if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* * Currently the only 'mixing' which can happen is between * different fastfail types. We can safely fail portions * which have all the failfast bits that the first one has - * the ones which are at least as eager to fail as the first * one. */ for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_opf & ff) != ff) break; bytes += bio->bi_iter.bi_size; } /* this could lead to infinite loop */ BUG_ON(blk_rq_bytes(rq) && !bytes); return bytes; } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->stamp); if (unlikely(stamp != now)) { if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } if (part->partno) { part = &part_to_disk(part)->part0; goto again; } } static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; part_stat_lock(); part = req->part; part_stat_add(part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } void blk_account_io_done(struct request *req, u64 now) { /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if (req->part && blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; part_stat_lock(); part = req->part; update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); hd_struct_put(part); } } void blk_account_io_start(struct request *rq) { if (!blk_do_io_stat(rq)) return; rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_stat_lock(); update_io_ticks(rq->part, jiffies, false); part_stat_unlock(); } static unsigned long __part_start_io_acct(struct hd_struct *part, unsigned int sectors, unsigned int op) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); part_stat_lock(); update_io_ticks(part, now, false); part_stat_inc(part, ios[sgrp]); part_stat_add(part, sectors[sgrp], sectors); part_stat_local_inc(part, in_flight[op_is_write(op)]); part_stat_unlock(); return now; } unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, struct bio *bio) { *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); } EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { return __part_start_io_acct(&disk->part0, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); static void __part_end_io_acct(struct hd_struct *part, unsigned int op, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(part, in_flight[op_is_write(op)]); part_stat_unlock(); } void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); hd_struct_put(part); } EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { __part_end_io_acct(&disk->part0, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. * Actual device drivers should use blk_mq_end_request instead. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both * blk_rq_bytes() and in blk_update_request(). * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { int total_bytes; trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); if (!req->bio) return false; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) req->q->integrity.profile->complete_fn(req, nr_bytes); #endif if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error, __func__); blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); req_bio_endio(req, bio, bio_bytes, error); total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE /** * rq_flush_dcache_pages - Helper function to flush all pages in a request * @rq: the request to be flushed * * Description: * Flush all pages in @rq. */ void rq_flush_dcache_pages(struct request *rq) { struct req_iterator iter; struct bio_vec bvec; rq_for_each_segment(bvec, rq, iter) flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); #endif /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (queue_is_mq(q) && q->mq_ops->busy) return q->mq_ops->busy(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio, *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } bio = NULL; } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: if (bio) bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * blk_start_plug() indicates to the block layer an intent by the caller * to submit multiple I/O requests in a batch. The block layer may use * this hint to defer submitting I/Os from the caller until blk_finish_plug() * is called. However, the block layer may choose to submit requests * before a call to blk_finish_plug() if the number of queued I/Os * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if * the task schedules (see below). * * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. */ if (tsk->plug) return; INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); plug->rq_count = 0; plug->multiple_queues = false; plug->nowait = false; /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); } /** * blk_finish_plug - mark the end of a batch of submitted I/O * @plug: The &struct blk_plug passed to blk_start_plug() * * Description: * Indicate that a batch of I/O submissions is complete. This function * must be paired with an initial call to blk_start_plug(). The intent * is to allow the block layer to optimize I/O submission. See the * documentation for blk_start_plug() for more information. */ void blk_finish_plug(struct blk_plug *plug) { if (plug != current->plug) return; blk_flush_plug_list(plug, false); current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); void blk_io_schedule(void) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) io_schedule_timeout(timeout); else io_schedule(); } EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; }
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 // SPDX-License-Identifier: GPL-2.0 /* * Functions related to io context handling */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> #include <linux/sched/task.h> #include "blk.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. */ void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); } static void icq_free_icq_rcu(struct rcu_head *head) { struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); kmem_cache_free(icq->__rcu_icq_cache, icq); } /* * Exit an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_exit_icq(struct io_cq *icq) { struct elevator_type *et = icq->q->elevator->type; if (icq->flags & ICQ_EXITED) return; if (et->ops.exit_icq) et->ops.exit_icq(icq); icq->flags |= ICQ_EXITED; } /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. */ static void ioc_destroy_icq(struct io_cq *icq) { struct io_context *ioc = icq->ioc; struct request_queue *q = icq->q; struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); list_del_init(&icq->q_node); /* * Both setting lookup hint to and clearing it from @icq are done * under queue_lock. If it's not pointing to @icq now, it never * will. Hint assignment itself can race safely. */ if (rcu_access_pointer(ioc->icq_hint) == icq) rcu_assign_pointer(ioc->icq_hint, NULL); ioc_exit_icq(icq); /* * @icq->q might have gone away by the time RCU callback runs * making it impossible to determine icq_cache. Record it in @icq. */ icq->__rcu_icq_cache = et->icq_cache; icq->flags |= ICQ_DESTROYED; call_rcu(&icq->__rcu_head, icq_free_icq_rcu); } /* * Slow path for ioc release in put_io_context(). Performs double-lock * dancing to unlink all icq's and then frees ioc. */ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, struct io_cq, ioc_node); struct request_queue *q = icq->q; if (spin_trylock(&q->queue_lock)) { ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { /* Make sure q and icq cannot be freed. */ rcu_read_lock(); /* Re-acquire the locks in the correct order. */ spin_unlock(&ioc->lock); spin_lock(&q->queue_lock); spin_lock(&ioc->lock); /* * The icq may have been destroyed when the ioc lock * was released. */ if (!(icq->flags & ICQ_DESTROYED)) ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); rcu_read_unlock(); } } spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } /** * put_io_context - put a reference of io_context * @ioc: io_context to put * * Decrement reference count of @ioc and release it if the count reaches * zero. */ void put_io_context(struct io_context *ioc) { unsigned long flags; bool free_ioc = false; if (ioc == NULL) return; BUG_ON(atomic_long_read(&ioc->refcount) <= 0); /* * Releasing ioc requires reverse order double locking and we may * already be holding a queue_lock. Do it asynchronously from wq. */ if (atomic_long_dec_and_test(&ioc->refcount)) { spin_lock_irqsave(&ioc->lock, flags); if (!hlist_empty(&ioc->icq_list)) queue_work(system_power_efficient_wq, &ioc->release_work); else free_ioc = true; spin_unlock_irqrestore(&ioc->lock, flags); } if (free_ioc) kmem_cache_free(iocontext_cachep, ioc); } /** * put_io_context_active - put active reference on ioc * @ioc: ioc of interest * * Undo get_io_context_active(). If active reference reaches zero after * put, @ioc can never issue further IOs and ioscheds are notified. */ void put_io_context_active(struct io_context *ioc) { struct io_cq *icq; if (!atomic_dec_and_test(&ioc->active_ref)) { put_io_context(ioc); return; } spin_lock_irq(&ioc->lock); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; ioc_exit_icq(icq); } spin_unlock_irq(&ioc->lock); put_io_context(ioc); } /* Called by the exiting task */ void exit_io_context(struct task_struct *task) { struct io_context *ioc; task_lock(task); ioc = task->io_context; task->io_context = NULL; task_unlock(task); atomic_dec(&ioc->nr_tasks); put_io_context_active(ioc); } static void __ioc_clear_queue(struct list_head *icq_list) { unsigned long flags; rcu_read_lock(); while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); if (icq->flags & ICQ_DESTROYED) { spin_unlock_irqrestore(&ioc->lock, flags); continue; } ioc_destroy_icq(icq); spin_unlock_irqrestore(&ioc->lock, flags); } rcu_read_unlock(); } /** * ioc_clear_queue - break any ioc association with the specified queue * @q: request_queue being cleared * * Walk @q->icq_list and exit all io_cq's. */ void ioc_clear_queue(struct request_queue *q) { LIST_HEAD(icq_list); spin_lock_irq(&q->queue_lock); list_splice_init(&q->icq_list, &icq_list); spin_unlock_irq(&q->queue_lock); __ioc_clear_queue(&icq_list); } int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) { struct io_context *ioc; int ret; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) return -ENOMEM; /* initialize */ atomic_long_set(&ioc->refcount, 1); atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); /* * Try to install. ioc shouldn't be installed if someone else * already did or @task, which isn't %current, is exiting. Note * that we need to allow ioc creation on exiting %current as exit * path may issue IOs from e.g. exit_files(). The exit path is * responsible for not issuing IO after exit_io_context(). */ task_lock(task); if (!task->io_context && (task == current || !(task->flags & PF_EXITING))) task->io_context = ioc; else kmem_cache_free(iocontext_cachep, ioc); ret = task->io_context ? 0 : -EBUSY; task_unlock(task); return ret; } /** * get_task_io_context - get io_context of a task * @task: task of interest * @gfp_flags: allocation flags, used if allocation is necessary * @node: allocation node, used if allocation is necessary * * Return io_context of @task. If it doesn't exist, it is created with * @gfp_flags and @node. The returned io_context has its reference count * incremented. * * This function always goes through task_lock() and it's better to use * %current->io_context + get_io_context() for %current. */ struct io_context *get_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) { struct io_context *ioc; might_sleep_if(gfpflags_allow_blocking(gfp_flags)); do { task_lock(task); ioc = task->io_context; if (likely(ioc)) { get_io_context(ioc); task_unlock(task); return ioc; } task_unlock(task); } while (!create_task_io_context(task, gfp_flags, node)); return NULL; } /** * ioc_lookup_icq - lookup io_cq from ioc * @ioc: the associated io_context * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. */ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) { struct io_cq *icq; lockdep_assert_held(&q->queue_lock); /* * icq's are indexed from @ioc using radix tree and hint pointer, * both of which are protected with RCU. All removals are done * holding both q and ioc locks, and we're holding q lock - if we * find a icq which points to us, it's guaranteed to be valid. */ rcu_read_lock(); icq = rcu_dereference(ioc->icq_hint); if (icq && icq->q == q) goto out; icq = radix_tree_lookup(&ioc->icq_tree, q->id); if (icq && icq->q == q) rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ else icq = NULL; out: rcu_read_unlock(); return icq; } EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq * @ioc: io_context of interest * @q: request_queue of interest * @gfp_mask: allocation mask * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. * * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. */ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, gfp_t gfp_mask) { struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, q->node); if (!icq) return NULL; if (radix_tree_maybe_preload(gfp_mask) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } icq->ioc = ioc; icq->q = q; INIT_LIST_HEAD(&icq->q_node); INIT_HLIST_NODE(&icq->ioc_node); /* lock both q and ioc and try to link @icq */ spin_lock_irq(&q->queue_lock); spin_lock(&ioc->lock); if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); if (et->ops.init_icq) et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } spin_unlock(&ioc->lock); spin_unlock_irq(&q->queue_lock); radix_tree_preload_end(); return icq; } static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", sizeof(struct io_context), 0, SLAB_PANIC, NULL); return 0; } subsys_initcall(blk_ioc_init);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* delayacct.h - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 */ #ifndef _LINUX_DELAYACCT_H #define _LINUX_DELAYACCT_H #include <uapi/linux/taskstats.h> /* * Per-task flags relevant to delay accounting * maintained privately to avoid exhausting similar flags in sched.h:PF_* * Used to set current->delays->flags */ #define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ #define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; unsigned int flags; /* Private per-task flags */ /* For each stat XXX, add following, aligned appropriately * * struct timespec XXX_start, XXX_end; * u64 XXX_delay; * u32 XXX_count; * * Atomicity of updates to XXX_delay, XXX_count protected by * single lock above (split into XXX_lock if contention is an issue). */ /* * XXX_count is incremented on every XXX operation, the delay * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds. */ u64 blkio_start; /* Shared by blkio, swapin */ u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_delay; /* wait for swapin block io completion */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ u32 swapin_count; /* total count of the number of swapin block */ /* io operations performed */ u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; u64 thrashing_delay; /* wait for thrashing page */ u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ }; #endif #include <linux/sched.h> #include <linux/slab.h> #ifdef CONFIG_TASK_DELAY_ACCT extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(struct task_struct *); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { if (p->delays) return (p->delays->flags & DELAYACCT_PF_BLKIO); else return 0; } static inline void delayacct_set_flag(int flag) { if (current->delays) current->delays->flags |= flag; } static inline void delayacct_clear_flag(int flag) { if (current->delays) current->delays->flags &= ~flag; } static inline void delayacct_tsk_init(struct task_struct *tsk) { /* reinitialize in case parent's non-null pointer was dup'ed*/ tsk->delays = NULL; if (delayacct_on) __delayacct_tsk_init(tsk); } /* Free tsk->delays. Called from bad fork and __put_task_struct * where there's no risk of tsk->delays being accessed elsewhere */ static inline void delayacct_tsk_free(struct task_struct *tsk) { if (tsk->delays) kmem_cache_free(delayacct_cache, tsk->delays); tsk->delays = NULL; } static inline void delayacct_blkio_start(void) { delayacct_set_flag(DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } static inline void delayacct_blkio_end(struct task_struct *p) { if (p->delays) __delayacct_blkio_end(p); delayacct_clear_flag(DELAYACCT_PF_BLKIO); } static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { if (!delayacct_on || !tsk->delays) return 0; return __delayacct_add_tsk(d, tsk); } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { if (tsk->delays) return __delayacct_blkio_ticks(tsk); return 0; } static inline void delayacct_freepages_start(void) { if (current->delays) __delayacct_freepages_start(); } static inline void delayacct_freepages_end(void) { if (current->delays) __delayacct_freepages_end(); } static inline void delayacct_thrashing_start(void) { if (current->delays) __delayacct_thrashing_start(); } static inline void delayacct_thrashing_end(void) { if (current->delays) __delayacct_thrashing_end(); } #else static inline void delayacct_set_flag(int flag) {} static inline void delayacct_clear_flag(int flag) {} static inline void delayacct_init(void) {} static inline void delayacct_tsk_init(struct task_struct *tsk) {} static inline void delayacct_tsk_free(struct task_struct *tsk) {} static inline void delayacct_blkio_start(void) {} static inline void delayacct_blkio_end(struct task_struct *p) {} static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { return 0; } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { return 0; } static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { return 0; } static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} static inline void delayacct_thrashing_start(void) {} static inline void delayacct_thrashing_end(void) {} #endif /* CONFIG_TASK_DELAY_ACCT */ #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 /* SPDX-License-Identifier: GPL-2.0 */ /* * bvec iterator * * Copyright (C) 2001 Ming Lei <ming.lei@canonical.com> */ #ifndef __LINUX_BVEC_ITER_H #define __LINUX_BVEC_ITER_H #include <linux/bug.h> #include <linux/errno.h> #include <linux/limits.h> #include <linux/minmax.h> #include <linux/mm.h> #include <linux/types.h> struct page; /** * struct bio_vec - a contiguous range of physical memory addresses * @bv_page: First page associated with the address range. * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. * * The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len: * * nth_page(@bv_page, n) == @bv_page + n * * This holds because page_is_mergeable() checks the above property. */ struct bio_vec { struct page *bv_page; unsigned int bv_len; unsigned int bv_offset; }; struct bvec_iter { sector_t bi_sector; /* device address in 512 byte sectors */ unsigned int bi_size; /* residual I/O count */ unsigned int bi_idx; /* current index into bvl_vec */ unsigned int bi_bvec_done; /* number of bytes completed in current bvec */ }; struct bvec_iter_all { struct bio_vec bv; int idx; unsigned done; }; /* * various member access, note that bio_data should of course not be used * on highmem page vectors */ #define __bvec_iter_bvec(bvec, iter) (&(bvec)[(iter).bi_idx]) /* multi-page (mp_bvec) helpers */ #define mp_bvec_iter_page(bvec, iter) \ (__bvec_iter_bvec((bvec), (iter))->bv_page) #define mp_bvec_iter_len(bvec, iter) \ min((iter).bi_size, \ __bvec_iter_bvec((bvec), (iter))->bv_len - (iter).bi_bvec_done) #define mp_bvec_iter_offset(bvec, iter) \ (__bvec_iter_bvec((bvec), (iter))->bv_offset + (iter).bi_bvec_done) #define mp_bvec_iter_page_idx(bvec, iter) \ (mp_bvec_iter_offset((bvec), (iter)) / PAGE_SIZE) #define mp_bvec_iter_bvec(bvec, iter) \ ((struct bio_vec) { \ .bv_page = mp_bvec_iter_page((bvec), (iter)), \ .bv_len = mp_bvec_iter_len((bvec), (iter)), \ .bv_offset = mp_bvec_iter_offset((bvec), (iter)), \ }) /* For building single-page bvec in flight */ #define bvec_iter_offset(bvec, iter) \ (mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE) #define bvec_iter_len(bvec, iter) \ min_t(unsigned, mp_bvec_iter_len((bvec), (iter)), \ PAGE_SIZE - bvec_iter_offset((bvec), (iter))) #define bvec_iter_page(bvec, iter) \ (mp_bvec_iter_page((bvec), (iter)) + \ mp_bvec_iter_page_idx((bvec), (iter))) #define bvec_iter_bvec(bvec, iter) \ ((struct bio_vec) { \ .bv_page = bvec_iter_page((bvec), (iter)), \ .bv_len = bvec_iter_len((bvec), (iter)), \ .bv_offset = bvec_iter_offset((bvec), (iter)), \ }) static inline bool bvec_iter_advance(const struct bio_vec *bv, struct bvec_iter *iter, unsigned bytes) { unsigned int idx = iter->bi_idx; if (WARN_ONCE(bytes > iter->bi_size, "Attempted to advance past end of bvec iter\n")) { iter->bi_size = 0; return false; } iter->bi_size -= bytes; bytes += iter->bi_bvec_done; while (bytes && bytes >= bv[idx].bv_len) { bytes -= bv[idx].bv_len; idx++; } iter->bi_idx = idx; iter->bi_bvec_done = bytes; return true; } static inline void bvec_iter_skip_zero_bvec(struct bvec_iter *iter) { iter->bi_bvec_done = 0; iter->bi_idx++; } #define for_each_bvec(bvl, bio_vec, iter, start) \ for (iter = (start); \ (iter).bi_size && \ ((bvl = bvec_iter_bvec((bio_vec), (iter))), 1); \ (bvl).bv_len ? (void)bvec_iter_advance((bio_vec), &(iter), \ (bvl).bv_len) : bvec_iter_skip_zero_bvec(&(iter))) /* for iterating one bio from start to end */ #define BVEC_ITER_ALL_INIT (struct bvec_iter) \ { \ .bi_sector = 0, \ .bi_size = UINT_MAX, \ .bi_idx = 0, \ .bi_bvec_done = 0, \ } static inline struct bio_vec *bvec_init_iter_all(struct bvec_iter_all *iter_all) { iter_all->done = 0; iter_all->idx = 0; return &iter_all->bv; } static inline void bvec_advance(const struct bio_vec *bvec, struct bvec_iter_all *iter_all) { struct bio_vec *bv = &iter_all->bv; if (iter_all->done) { bv->bv_page++; bv->bv_offset = 0; } else { bv->bv_page = bvec->bv_page + (bvec->bv_offset >> PAGE_SHIFT); bv->bv_offset = bvec->bv_offset & ~PAGE_MASK; } bv->bv_len = min_t(unsigned int, PAGE_SIZE - bv->bv_offset, bvec->bv_len - iter_all->done); iter_all->done += bv->bv_len; if (iter_all->done == bvec->bv_len) { iter_all->idx++; iter_all->done = 0; } } #endif /* __LINUX_BVEC_ITER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 /* SPDX-License-Identifier: GPL-2.0 */ /* * Because linux/module.h has tracepoints in the header, and ftrace.h * used to include this file, define_trace.h includes linux/module.h * But we do not want the module.h to override the TRACE_SYSTEM macro * variable that define_trace.h is processing, so we only set it * when module events are being processed, which would happen when * CREATE_TRACE_POINTS is defined. */ #ifdef CREATE_TRACE_POINTS #undef TRACE_SYSTEM #define TRACE_SYSTEM module #endif #if !defined(_TRACE_MODULE_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_MODULE_H #include <linux/tracepoint.h> #ifdef CONFIG_MODULES struct module; #define show_module_flags(flags) __print_flags(flags, "", \ { (1UL << TAINT_PROPRIETARY_MODULE), "P" }, \ { (1UL << TAINT_OOT_MODULE), "O" }, \ { (1UL << TAINT_FORCED_MODULE), "F" }, \ { (1UL << TAINT_CRAP), "C" }, \ { (1UL << TAINT_UNSIGNED_MODULE), "E" }) TRACE_EVENT(module_load, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __field( unsigned int, taints ) __string( name, mod->name ) ), TP_fast_assign( __entry->taints = mod->taints; __assign_str(name, mod->name); ), TP_printk("%s %s", __get_str(name), show_module_flags(__entry->taints)) ); TRACE_EVENT(module_free, TP_PROTO(struct module *mod), TP_ARGS(mod), TP_STRUCT__entry( __string( name, mod->name ) ), TP_fast_assign( __assign_str(name, mod->name); ), TP_printk("%s", __get_str(name)) ); #ifdef CONFIG_MODULE_UNLOAD /* trace_module_get/put are only used if CONFIG_MODULE_UNLOAD is defined */ DECLARE_EVENT_CLASS(module_refcnt, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( int, refcnt ) __string( name, mod->name ) ), TP_fast_assign( __entry->ip = ip; __entry->refcnt = atomic_read(&mod->refcnt); __assign_str(name, mod->name); ), TP_printk("%s call_site=%ps refcnt=%d", __get_str(name), (void *)__entry->ip, __entry->refcnt) ); DEFINE_EVENT(module_refcnt, module_get, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); DEFINE_EVENT(module_refcnt, module_put, TP_PROTO(struct module *mod, unsigned long ip), TP_ARGS(mod, ip) ); #endif /* CONFIG_MODULE_UNLOAD */ TRACE_EVENT(module_request, TP_PROTO(char *name, bool wait, unsigned long ip), TP_ARGS(name, wait, ip), TP_STRUCT__entry( __field( unsigned long, ip ) __field( bool, wait ) __string( name, name ) ), TP_fast_assign( __entry->ip = ip; __entry->wait = wait; __assign_str(name, name); ), TP_printk("%s wait=%d call_site=%ps", __get_str(name), (int)__entry->wait, (void *)__entry->ip) ); #endif /* CONFIG_MODULES */ #endif /* _TRACE_MODULE_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 /* SPDX-License-Identifier: GPL-2.0 */ /* * Filesystem access notification for Linux * * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #ifndef __LINUX_FSNOTIFY_BACKEND_H #define __LINUX_FSNOTIFY_BACKEND_H #ifdef __KERNEL__ #include <linux/idr.h> /* inotify uses this */ #include <linux/fs.h> /* struct inode */ #include <linux/list.h> #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily * convert between them. dnotify only needs conversion at watch creation * so no perf loss there. fanotify isn't defined yet, so it can use the * wholes if it needs more events. */ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ #define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */ #define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ #define FS_CREATE 0x00000100 /* Subfile was created */ #define FS_DELETE 0x00000200 /* Subfile was deleted */ #define FS_DELETE_SELF 0x00000400 /* Self was deleted */ #define FS_MOVE_SELF 0x00000800 /* Self was moved */ #define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ #define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. * Set on inode/sb/mount marks that care about parent/name info. */ #define FS_EVENT_ON_CHILD 0x08000000 #define FS_DN_RENAME 0x10000000 /* file renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_IN_ONESHOT 0x80000000 /* only send event once */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) /* * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE) #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) /* * This is a list of all events that may get sent to a parent that is watching * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory. */ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children. */ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \ FS_DN_MULTISHOT | FS_EVENT_ON_CHILD) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { struct list_head list; unsigned long objectid; /* identifier for queue merges */ }; /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. */ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ /* * Valid fsnotify group priorities. Events are send in order from highest * priority to lowest priority. We default to the lowest priority. */ #define FS_PRIO_0 0 /* normal notifiers, no permissions */ #define FS_PRIO_1 1 /* fanotify content based access control */ #define FS_PRIO_2 2 /* fanotify pre-content access */ unsigned int priority; bool shutdown; /* group is being shut down, don't queue more events */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t num_marks; /* 1 for each mark and 1 for not being * past the point of no return when freeing * a group */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ unsigned int max_marks; struct user_struct *user; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; default: return NULL; } } enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_PARENT, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; #define FSNOTIFY_OBJ_TYPE_INODE_FL (1U << FSNOTIFY_OBJ_TYPE_INODE) #define FSNOTIFY_OBJ_TYPE_PARENT_FL (1U << FSNOTIFY_OBJ_TYPE_PARENT) #define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT) #define FSNOTIFY_OBJ_TYPE_SB_FL (1U << FSNOTIFY_OBJ_TYPE_SB) #define FSNOTIFY_OBJ_ALL_TYPES_MASK ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1) static inline bool fsnotify_valid_obj_type(unsigned int type) { return (type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_OBJ_TYPE_COUNT]; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int type) { return (iter_info->report_mask & (1U << type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int type) { iter_info->report_mask |= (1U << type); } static inline void fsnotify_iter_set_report_type_mark( struct fsnotify_iter_info *iter_info, int type, struct fsnotify_mark *mark) { iter_info->marks[type] = mark; iter_info->report_mask |= (1U << type); } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return (iter_info->report_mask & FSNOTIFY_OBJ_TYPE_##NAME##_FL) ? \ iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_obj_type(type) \ for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++) /* * fsnotify_connp_t is what we embed in objects which connector can be attached * to. fsnotify_connp_t * is how we refer from connector back to object. */ struct fsnotify_mark_connector; typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ #define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 unsigned short flags; /* flags [lock] */ __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ fsnotify_connp_t *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01 #define FSNOTIFY_MARK_FLAG_ALIVE 0x02 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern u32 fsnotify_get_cookie(void); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? */ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happens to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following __fsnotify_update_child_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct list_head *, struct fsnotify_event *)); /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the marks attached to inodes */ /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, struct fsnotify_group *group); /* Get cached fsid of filesystem containing object */ extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn, __kernel_fsid_t *fsid); /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, int allow_dups, __kernel_fsid_t *fsid); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, int allow_dups, __kernel_fsid_t *fsid); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int allow_dups) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int allow_dups) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* run all the marks in a group, and clear all of the marks attached to given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type); /* run all the marks in a group, and clear all of the vfsmount marks */ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL); } /* run all the marks in a group, and clear all of the inode marks */ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE_FL); } /* run all the marks in a group, and clear all of the sn marks */ static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB_FL); } extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event, unsigned long objectid) { INIT_LIST_HEAD(&event->list); event->objectid = objectid; } #else static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void fsnotify_sb_delete(struct super_block *sb) {} static inline void fsnotify_update_flags(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) { return 0; } static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ #endif /* __LINUX_FSNOTIFY_BACKEND_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef _ASM_X86_INSN_H #define _ASM_X86_INSN_H /* * x86 instruction analysis * * Copyright (C) IBM Corporation, 2009 */ /* insn_attr_t is defined in inat.h */ #include <asm/inat.h> struct insn_field { union { insn_value_t value; insn_byte_t bytes[4]; }; /* !0 if we've run insn_get_xxx() for this field */ unsigned char got; unsigned char nbytes; }; struct insn { struct insn_field prefixes; /* * Prefixes * prefixes.bytes[3]: last prefix */ struct insn_field rex_prefix; /* REX prefix */ struct insn_field vex_prefix; /* VEX prefix */ struct insn_field opcode; /* * opcode.bytes[0]: opcode1 * opcode.bytes[1]: opcode2 * opcode.bytes[2]: opcode3 */ struct insn_field modrm; struct insn_field sib; struct insn_field displacement; union { struct insn_field immediate; struct insn_field moffset1; /* for 64bit MOV */ struct insn_field immediate1; /* for 64bit imm or off16/32 */ }; union { struct insn_field moffset2; /* for 64bit MOV */ struct insn_field immediate2; /* for 64bit imm or seg16 */ }; int emulate_prefix_size; insn_attr_t attr; unsigned char opnd_bytes; unsigned char addr_bytes; unsigned char length; unsigned char x86_64; const insn_byte_t *kaddr; /* kernel address of insn to analyze */ const insn_byte_t *end_kaddr; /* kernel address of last insn in buffer */ const insn_byte_t *next_byte; }; #define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) #define X86_MODRM_RM(modrm) ((modrm) & 0x07) #define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6) #define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3) #define X86_SIB_BASE(sib) ((sib) & 0x07) #define X86_REX_W(rex) ((rex) & 8) #define X86_REX_R(rex) ((rex) & 4) #define X86_REX_X(rex) ((rex) & 2) #define X86_REX_B(rex) ((rex) & 1) /* VEX bit flags */ #define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */ #define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */ #define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */ #define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */ #define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */ /* VEX bit fields */ #define X86_EVEX_M(vex) ((vex) & 0x03) /* EVEX Byte1 */ #define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */ #define X86_VEX2_M 1 /* VEX2.M always 1 */ #define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */ #define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */ extern void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64); extern void insn_get_prefixes(struct insn *insn); extern void insn_get_opcode(struct insn *insn); extern void insn_get_modrm(struct insn *insn); extern void insn_get_sib(struct insn *insn); extern void insn_get_displacement(struct insn *insn); extern void insn_get_immediate(struct insn *insn); extern void insn_get_length(struct insn *insn); /* Attribute will be determined after getting ModRM (for opcode groups) */ static inline void insn_get_attribute(struct insn *insn) { insn_get_modrm(insn); } /* Instruction uses RIP-relative addressing */ extern int insn_rip_relative(struct insn *insn); /* Init insn for kernel text */ static inline void kernel_insn_init(struct insn *insn, const void *kaddr, int buf_len) { #ifdef CONFIG_X86_64 insn_init(insn, kaddr, buf_len, 1); #else /* CONFIG_X86_32 */ insn_init(insn, kaddr, buf_len, 0); #endif } static inline int insn_is_avx(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.value != 0); } static inline int insn_is_evex(struct insn *insn) { if (!insn->prefixes.got) insn_get_prefixes(insn); return (insn->vex_prefix.nbytes == 4); } static inline int insn_has_emulate_prefix(struct insn *insn) { return !!insn->emulate_prefix_size; } /* Ensure this instruction is decoded completely */ static inline int insn_complete(struct insn *insn) { return insn->opcode.got && insn->modrm.got && insn->sib.got && insn->displacement.got && insn->immediate.got; } static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX2_M; else if (insn->vex_prefix.nbytes == 3) /* 3 bytes VEX */ return X86_VEX3_M(insn->vex_prefix.bytes[1]); else /* EVEX */ return X86_EVEX_M(insn->vex_prefix.bytes[1]); } static inline insn_byte_t insn_vex_p_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ return X86_VEX_P(insn->vex_prefix.bytes[1]); else return X86_VEX_P(insn->vex_prefix.bytes[2]); } /* Get the last prefix id from last prefix or VEX prefix */ static inline int insn_last_prefix_id(struct insn *insn) { if (insn_is_avx(insn)) return insn_vex_p_bits(insn); /* VEX_p is a SIMD prefix id */ if (insn->prefixes.bytes[3]) return inat_get_last_prefix_id(insn->prefixes.bytes[3]); return 0; } /* Offset of each field from kaddr */ static inline int insn_offset_rex_prefix(struct insn *insn) { return insn->prefixes.nbytes; } static inline int insn_offset_vex_prefix(struct insn *insn) { return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes; } static inline int insn_offset_opcode(struct insn *insn) { return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes; } static inline int insn_offset_modrm(struct insn *insn) { return insn_offset_opcode(insn) + insn->opcode.nbytes; } static inline int insn_offset_sib(struct insn *insn) { return insn_offset_modrm(insn) + insn->modrm.nbytes; } static inline int insn_offset_displacement(struct insn *insn) { return insn_offset_sib(insn) + insn->sib.nbytes; } static inline int insn_offset_immediate(struct insn *insn) { return insn_offset_displacement(insn) + insn->displacement.nbytes; } /** * for_each_insn_prefix() -- Iterate prefixes in the instruction * @insn: Pointer to struct insn. * @idx: Index storage. * @prefix: Prefix byte. * * Iterate prefix bytes of given @insn. Each prefix byte is stored in @prefix * and the index is stored in @idx (note that this @idx is just for a cursor, * do not change it.) * Since prefixes.nbytes can be bigger than 4 if some prefixes * are repeated, it cannot be used for looping over the prefixes. */ #define for_each_insn_prefix(insn, idx, prefix) \ for (idx = 0; idx < ARRAY_SIZE(insn->prefixes.bytes) && (prefix = insn->prefixes.bytes[idx]) != 0; idx++) #define POP_SS_OPCODE 0x1f #define MOV_SREG_OPCODE 0x8e /* * Intel SDM Vol.3A 6.8.3 states; * "Any single-step trap that would be delivered following the MOV to SS * instruction or POP to SS instruction (because EFLAGS.TF is 1) is * suppressed." * This function returns true if @insn is MOV SS or POP SS. On these * instructions, single stepping is suppressed. */ static inline int insn_masking_exception(struct insn *insn) { return insn->opcode.bytes[0] == POP_SS_OPCODE || (insn->opcode.bytes[0] == MOV_SREG_OPCODE && X86_MODRM_REG(insn->modrm.bytes[0]) == 2); } #endif /* _ASM_X86_INSN_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PKEYS_H #define _ASM_X86_PKEYS_H #define ARCH_DEFAULT_PKEY 0 /* * If more than 16 keys are ever supported, a thorough audit * will be necessary to ensure that the types that store key * numbers and masks have sufficient capacity. */ #define arch_max_pkey() (boot_cpu_has(X86_FEATURE_OSPKE) ? 16 : 1) extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); static inline bool arch_pkeys_enabled(void) { return boot_cpu_has(X86_FEATURE_OSPKE); } /* * Try to dedicate one of the protection keys to be used as an * execute-only protection key. */ extern int __execute_only_pkey(struct mm_struct *mm); static inline int execute_only_pkey(struct mm_struct *mm) { if (!boot_cpu_has(X86_FEATURE_OSPKE)) return ARCH_DEFAULT_PKEY; return __execute_only_pkey(mm); } extern int __arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey); static inline int arch_override_mprotect_pkey(struct vm_area_struct *vma, int prot, int pkey) { if (!boot_cpu_has(X86_FEATURE_OSPKE)) return 0; return __arch_override_mprotect_pkey(vma, prot, pkey); } extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); #define ARCH_VM_PKEY_FLAGS (VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3) #define mm_pkey_allocation_map(mm) (mm->context.pkey_allocation_map) #define mm_set_pkey_allocated(mm, pkey) do { \ mm_pkey_allocation_map(mm) |= (1U << pkey); \ } while (0) #define mm_set_pkey_free(mm, pkey) do { \ mm_pkey_allocation_map(mm) &= ~(1U << pkey); \ } while (0) static inline bool mm_pkey_is_allocated(struct mm_struct *mm, int pkey) { /* * "Allocated" pkeys are those that have been returned * from pkey_alloc() or pkey 0 which is allocated * implicitly when the mm is created. */ if (pkey < 0) return false; if (pkey >= arch_max_pkey()) return false; /* * The exec-only pkey is set in the allocation map, but * is not available to any of the user interfaces like * mprotect_pkey(). */ if (pkey == mm->context.execute_only_pkey) return false; return mm_pkey_allocation_map(mm) & (1U << pkey); } /* * Returns a positive, 4-bit key on success, or -1 on failure. */ static inline int mm_pkey_alloc(struct mm_struct *mm) { /* * Note: this is the one and only place we make sure * that the pkey is valid as far as the hardware is * concerned. The rest of the kernel trusts that * only good, valid pkeys come out of here. */ u16 all_pkeys_mask = ((1U << arch_max_pkey()) - 1); int ret; /* * Are we out of pkeys? We must handle this specially * because ffz() behavior is undefined if there are no * zeros. */ if (mm_pkey_allocation_map(mm) == all_pkeys_mask) return -1; ret = ffz(mm_pkey_allocation_map(mm)); mm_set_pkey_allocated(mm, ret); return ret; } static inline int mm_pkey_free(struct mm_struct *mm, int pkey) { if (!mm_pkey_is_allocated(mm, pkey)) return -EINVAL; mm_set_pkey_free(mm, pkey); return 0; } extern int arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); extern int __arch_set_user_pkey_access(struct task_struct *tsk, int pkey, unsigned long init_val); extern void copy_init_pkru_to_fpregs(void); static inline int vma_pkey(struct vm_area_struct *vma) { unsigned long vma_pkey_mask = VM_PKEY_BIT0 | VM_PKEY_BIT1 | VM_PKEY_BIT2 | VM_PKEY_BIT3; return (vma->vm_flags & vma_pkey_mask) >> VM_PKEY_SHIFT; } #endif /*_ASM_X86_PKEYS_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _IP6_FIB_H #define _IP6_FIB_H #include <linux/ipv6_route.h> #include <linux/rtnetlink.h> #include <linux/spinlock.h> #include <linux/notifier.h> #include <net/dst.h> #include <net/flow.h> #include <net/ip_fib.h> #include <net/netlink.h> #include <net/inetpeer.h> #include <net/fib_notifier.h> #include <linux/indirect_call_wrapper.h> #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_HASHSZ 256 #else #define FIB6_TABLE_HASHSZ 1 #endif #define RT6_DEBUG 2 #if RT6_DEBUG >= 3 #define RT6_TRACE(x...) pr_debug(x) #else #define RT6_TRACE(x...) do { ; } while (0) #endif struct rt6_info; struct fib6_info; struct fib6_config { u32 fc_table; u32 fc_metric; int fc_dst_len; int fc_src_len; int fc_ifindex; u32 fc_flags; u32 fc_protocol; u16 fc_type; /* only 8 bits are used */ u16 fc_delete_all_nh : 1, fc_ignore_dev_down:1, __unused : 14; u32 fc_nh_id; struct in6_addr fc_dst; struct in6_addr fc_src; struct in6_addr fc_prefsrc; struct in6_addr fc_gateway; unsigned long fc_expires; struct nlattr *fc_mx; int fc_mx_len; int fc_mp_len; struct nlattr *fc_mp; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; bool fc_is_fdb; }; struct fib6_node { struct fib6_node __rcu *parent; struct fib6_node __rcu *left; struct fib6_node __rcu *right; #ifdef CONFIG_IPV6_SUBTREES struct fib6_node __rcu *subtree; #endif struct fib6_info __rcu *leaf; __u16 fn_bit; /* bit key */ __u16 fn_flags; int fn_sernum; struct fib6_info __rcu *rr_ptr; struct rcu_head rcu; }; struct fib6_gc_args { int timeout; int more; }; #ifndef CONFIG_IPV6_SUBTREES #define FIB6_SUBTREE(fn) NULL static inline bool fib6_routes_require_src(const struct net *net) { return false; } static inline void fib6_routes_require_src_inc(struct net *net) {} static inline void fib6_routes_require_src_dec(struct net *net) {} #else static inline bool fib6_routes_require_src(const struct net *net) { return net->ipv6.fib6_routes_require_src > 0; } static inline void fib6_routes_require_src_inc(struct net *net) { net->ipv6.fib6_routes_require_src++; } static inline void fib6_routes_require_src_dec(struct net *net) { net->ipv6.fib6_routes_require_src--; } #define FIB6_SUBTREE(fn) (rcu_dereference_protected((fn)->subtree, 1)) #endif /* * routing information * */ struct rt6key { struct in6_addr addr; int plen; }; struct fib6_table; struct rt6_exception_bucket { struct hlist_head chain; int depth; }; struct rt6_exception { struct hlist_node hlist; struct rt6_info *rt6i; unsigned long stamp; struct rcu_head rcu; }; #define FIB6_EXCEPTION_BUCKET_SIZE_SHIFT 10 #define FIB6_EXCEPTION_BUCKET_SIZE (1 << FIB6_EXCEPTION_BUCKET_SIZE_SHIFT) #define FIB6_MAX_DEPTH 5 struct fib6_nh { struct fib_nh_common nh_common; #ifdef CONFIG_IPV6_ROUTER_PREF unsigned long last_probe; #endif struct rt6_info * __percpu *rt6i_pcpu; struct rt6_exception_bucket __rcu *rt6i_exception_bucket; }; struct fib6_info { struct fib6_table *fib6_table; struct fib6_info __rcu *fib6_next; struct fib6_node __rcu *fib6_node; /* Multipath routes: * siblings is a list of fib6_info that have the same metric/weight, * destination, but not the same gateway. nsiblings is just a cache * to speed up lookup. */ union { struct list_head fib6_siblings; struct list_head nh_list; }; unsigned int fib6_nsiblings; refcount_t fib6_ref; unsigned long expires; struct dst_metrics *fib6_metrics; #define fib6_pmtu fib6_metrics->metrics[RTAX_MTU-1] struct rt6key fib6_dst; u32 fib6_flags; struct rt6key fib6_src; struct rt6key fib6_prefsrc; u32 fib6_metric; u8 fib6_protocol; u8 fib6_type; u8 should_flush:1, dst_nocount:1, dst_nopolicy:1, fib6_destroying:1, offload:1, trap:1, unused:2; struct rcu_head rcu; struct nexthop *nh; struct fib6_nh fib6_nh[]; }; struct rt6_info { struct dst_entry dst; struct fib6_info __rcu *from; int sernum; struct rt6key rt6i_dst; struct rt6key rt6i_src; struct in6_addr rt6i_gateway; struct inet6_dev *rt6i_idev; u32 rt6i_flags; struct list_head rt6i_uncached; struct uncached_list *rt6i_uncached_list; /* more non-fragment space at head required */ unsigned short rt6i_nfheader_len; }; struct fib6_result { struct fib6_nh *nh; struct fib6_info *f6i; u32 fib6_flags; u8 fib6_type; struct rt6_info *rt6; }; #define for_each_fib6_node_rt_rcu(fn) \ for (rt = rcu_dereference((fn)->leaf); rt; \ rt = rcu_dereference(rt->fib6_next)) #define for_each_fib6_walker_rt(w) \ for (rt = (w)->leaf; rt; \ rt = rcu_dereference_protected(rt->fib6_next, 1)) static inline struct inet6_dev *ip6_dst_idev(struct dst_entry *dst) { return ((struct rt6_info *)dst)->rt6i_idev; } static inline bool fib6_requires_src(const struct fib6_info *rt) { return rt->fib6_src.plen > 0; } static inline void fib6_clean_expires(struct fib6_info *f6i) { f6i->fib6_flags &= ~RTF_EXPIRES; f6i->expires = 0; } static inline void fib6_set_expires(struct fib6_info *f6i, unsigned long expires) { f6i->expires = expires; f6i->fib6_flags |= RTF_EXPIRES; } static inline bool fib6_check_expired(const struct fib6_info *f6i) { if (f6i->fib6_flags & RTF_EXPIRES) return time_after(jiffies, f6i->expires); return false; } /* Function to safely get fn->sernum for passed in rt * and store result in passed in cookie. * Return true if we can get cookie safely * Return false if not */ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, u32 *cookie) { struct fib6_node *fn; bool status = false; fn = rcu_dereference(f6i->fib6_node); if (fn) { *cookie = fn->fn_sernum; /* pairs with smp_wmb() in fib6_update_sernum_upto_root() */ smp_rmb(); status = true; } return status; } static inline u32 rt6_get_cookie(const struct rt6_info *rt) { struct fib6_info *from; u32 cookie = 0; if (rt->sernum) return rt->sernum; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) fib6_get_cookie_safe(from, &cookie); rcu_read_unlock(); return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) { /* dst_release() accepts a NULL parameter. * We rely on dst being first structure in struct rt6_info */ BUILD_BUG_ON(offsetof(struct rt6_info, dst) != 0); dst_release(&rt->dst); } struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh); void fib6_info_destroy_rcu(struct rcu_head *head); static inline void fib6_info_hold(struct fib6_info *f6i) { refcount_inc(&f6i->fib6_ref); } static inline bool fib6_info_hold_safe(struct fib6_info *f6i) { return refcount_inc_not_zero(&f6i->fib6_ref); } static inline void fib6_info_release(struct fib6_info *f6i) { if (f6i && refcount_dec_and_test(&f6i->fib6_ref)) call_rcu(&f6i->rcu, fib6_info_destroy_rcu); } static inline void fib6_info_hw_flags_set(struct fib6_info *f6i, bool offload, bool trap) { f6i->offload = offload; f6i->trap = trap; } enum fib6_walk_state { #ifdef CONFIG_IPV6_SUBTREES FWS_S, #endif FWS_L, FWS_R, FWS_C, FWS_U }; struct fib6_walker { struct list_head lh; struct fib6_node *root, *node; struct fib6_info *leaf; enum fib6_walk_state state; unsigned int skip; unsigned int count; unsigned int skip_in_node; int (*func)(struct fib6_walker *); void *args; }; struct rt6_statistics { __u32 fib_nodes; /* all fib6 nodes */ __u32 fib_route_nodes; /* intermediate nodes */ __u32 fib_rt_entries; /* rt entries in fib table */ __u32 fib_rt_cache; /* cached rt entries in exception table */ __u32 fib_discarded_routes; /* total number of routes delete */ /* The following stats are not protected by any lock */ atomic_t fib_rt_alloc; /* total number of routes alloced */ atomic_t fib_rt_uncache; /* rt entries in uncached list */ }; #define RTN_TL_ROOT 0x0001 #define RTN_ROOT 0x0002 /* tree root node */ #define RTN_RTINFO 0x0004 /* node with valid routing info */ /* * priority levels (or metrics) * */ struct fib6_table { struct hlist_node tb6_hlist; u32 tb6_id; spinlock_t tb6_lock; struct fib6_node tb6_root; struct inet_peer_base tb6_peers; unsigned int flags; unsigned int fib_seq; #define RT6_TABLE_HAS_DFLT_ROUTER BIT(0) }; #define RT6_TABLE_UNSPEC RT_TABLE_UNSPEC #define RT6_TABLE_MAIN RT_TABLE_MAIN #define RT6_TABLE_DFLT RT6_TABLE_MAIN #define RT6_TABLE_INFO RT6_TABLE_MAIN #define RT6_TABLE_PREFIX RT6_TABLE_MAIN #ifdef CONFIG_IPV6_MULTIPLE_TABLES #define FIB6_TABLE_MIN 1 #define FIB6_TABLE_MAX RT_TABLE_MAX #define RT6_TABLE_LOCAL RT_TABLE_LOCAL #else #define FIB6_TABLE_MIN RT_TABLE_MAIN #define FIB6_TABLE_MAX FIB6_TABLE_MIN #define RT6_TABLE_LOCAL RT6_TABLE_MAIN #endif typedef struct rt6_info *(*pol_lookup_t)(struct net *, struct fib6_table *, struct flowi6 *, const struct sk_buff *, int); struct fib6_entry_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib6_info *rt; unsigned int nsiblings; }; /* * exported functions */ struct fib6_table *fib6_get_table(struct net *net, u32 id); struct fib6_table *fib6_new_table(struct net *net, u32 id); struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6, const struct sk_buff *skb, int flags, pol_lookup_t lookup); /* called with rcu lock held; can return error pointer * caller needs to select path */ int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6, struct fib6_result *res, int flags); /* called with rcu lock held; caller needs to select path */ int fib6_table_lookup(struct net *net, struct fib6_table *table, int oif, struct flowi6 *fl6, struct fib6_result *res, int strict); void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict); struct fib6_node *fib6_node_lookup(struct fib6_node *root, const struct in6_addr *daddr, const struct in6_addr *saddr); struct fib6_node *fib6_locate(struct fib6_node *root, const struct in6_addr *daddr, int dst_len, const struct in6_addr *saddr, int src_len, bool exact_match); void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); void fib6_clean_all_skip_notify(struct net *net, int (*func)(struct fib6_info *, void *arg), void *arg); int fib6_add(struct fib6_node *root, struct fib6_info *rt, struct nl_info *info, struct netlink_ext_ack *extack); int fib6_del(struct fib6_info *rt, struct nl_info *info); static inline void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr) { const struct fib6_info *from; rcu_read_lock(); from = rcu_dereference(rt->from); if (from) { *addr = from->fib6_prefsrc.addr; } else { struct in6_addr in6_zero = {}; *addr = in6_zero; } rcu_read_unlock(); } int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib6_nh_release(struct fib6_nh *fib6_nh); void fib6_nh_release_dsts(struct fib6_nh *fib6_nh); int call_fib6_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, struct netlink_ext_ack *extack); int call_fib6_multipath_entry_notifiers(struct net *net, enum fib_event_type event_type, struct fib6_info *rt, unsigned int nsiblings, struct netlink_ext_ack *extack); int call_fib6_entry_notifiers_replace(struct net *net, struct fib6_info *rt); void fib6_rt_update(struct net *net, struct fib6_info *rt, struct nl_info *info); void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, unsigned int flags); void fib6_run_gc(unsigned long expires, struct net *net, bool force); void fib6_gc_cleanup(void); int fib6_init(void); struct ipv6_route_iter { struct seq_net_private p; struct fib6_walker w; loff_t skip; struct fib6_table *tbl; int sernum; }; extern const struct seq_operations ipv6_route_seq_ops; int call_fib6_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib6_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib6_notifier_init(struct net *net); void __net_exit fib6_notifier_exit(struct net *net); unsigned int fib6_tables_seq_read(struct net *net); int fib6_tables_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); void fib6_update_sernum(struct net *net, struct fib6_info *rt); void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt); void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i); void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val); static inline bool fib6_metric_locked(struct fib6_info *f6i, int metric) { return !!(f6i->fib6_metrics->metrics[RTAX_LOCK - 1] & (1 << metric)); } #if IS_BUILTIN(CONFIG_IPV6) && defined(CONFIG_BPF_SYSCALL) struct bpf_iter__ipv6_route { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct fib6_info *, rt); }; #endif INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *__ip6_route_redirect(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); INDIRECT_CALLABLE_DECLARE(struct rt6_info *ip6_pol_route_lookup(struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags)); static inline struct rt6_info *pol_lookup_func(pol_lookup_t lookup, struct net *net, struct fib6_table *table, struct flowi6 *fl6, const struct sk_buff *skb, int flags) { return INDIRECT_CALL_4(lookup, ip6_pol_route_output, ip6_pol_route_input, ip6_pol_route_lookup, __ip6_route_redirect, net, table, fl6, skb, flags); } #ifdef CONFIG_IPV6_MULTIPLE_TABLES static inline bool fib6_has_custom_rules(const struct net *net) { return net->ipv6.fib6_has_custom_rules; } int fib6_rules_init(void); void fib6_rules_cleanup(void); bool fib6_rule_default(const struct fib_rule *rule); int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib6_rules_seq_read(struct net *net); static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv6.fib6_rules_require_fldissect) return false; skb_flow_dissect_flow_keys(skb, flkeys, flag); fl6->fl6_sport = flkeys->ports.src; fl6->fl6_dport = flkeys->ports.dst; fl6->flowi6_proto = flkeys->basic.ip_proto; return true; } #else static inline bool fib6_has_custom_rules(const struct net *net) { return false; } static inline int fib6_rules_init(void) { return 0; } static inline void fib6_rules_cleanup(void) { return ; } static inline bool fib6_rule_default(const struct fib_rule *rule) { return true; } static inline int fib6_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib6_rules_seq_read(struct net *net) { return 0; } static inline bool fib6_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi6 *fl6, struct flow_keys *flkeys) { return false; } #endif #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 // SPDX-License-Identifier: GPL-2.0-only /* * Generic pidhash and scalable, time-bounded PID allocator * * (C) 2002-2003 Nadia Yvette Chambers, IBM * (C) 2004 Nadia Yvette Chambers, Oracle * (C) 2002-2004 Ingo Molnar, Red Hat * * pid-structures are backing objects for tasks sharing a given ID to chain * against. There is very little to them aside from hashing them and * parking tasks using given ID's on a list. * * The hash is always changed with the tasklist_lock write-acquired, * and the hash is only accessed with the tasklist_lock at least * read-acquired, so there's no additional SMP locking needed here. * * We have a list of bitmap pages, which bitmaps represent the PID space. * Allocating and freeing PIDs is completely lockless. The worst-case * allocation scenario when all but one out of 1 million PIDs possible are * allocated already: the scanning of 32 list entries and at most PAGE_SIZE * bytes. The typical fastpath is a single successful setbit. Freeing is O(1). * * Pid namespaces: * (C) 2007 Pavel Emelyanov <xemul@openvz.org>, OpenVZ, SWsoft Inc. * (C) 2007 Sukadev Bhattiprolu <sukadev@us.ibm.com>, IBM * Many thanks to Oleg Nesterov for comments and help * */ #include <linux/mm.h> #include <linux/export.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/rculist.h> #include <linux/memblock.h> #include <linux/pid_namespace.h> #include <linux/init_task.h> #include <linux/syscalls.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/anon_inodes.h> #include <linux/sched/signal.h> #include <linux/sched/task.h> #include <linux/idr.h> #include <net/sock.h> #include <uapi/linux/pidfd.h> struct pid init_struct_pid = { .count = REFCOUNT_INIT(1), .tasks = { { .first = NULL }, { .first = NULL }, { .first = NULL }, }, .level = 0, .numbers = { { .nr = 0, .ns = &init_pid_ns, }, } }; int pid_max = PID_MAX_DEFAULT; #define RESERVED_PIDS 300 int pid_max_min = RESERVED_PIDS + 1; int pid_max_max = PID_MAX_LIMIT; /* * PID-map pages start out as NULL, they get allocated upon * first use and are never deallocated. This way a low pid_max * value does not cause lots of bitmaps to be allocated, but * the scheme scales to up to 4 million PIDs, runtime. */ struct pid_namespace init_pid_ns = { .kref = KREF_INIT(2), .idr = IDR_INIT(init_pid_ns.idr), .pid_allocated = PIDNS_ADDING, .level = 0, .child_reaper = &init_task, .user_ns = &init_user_ns, .ns.inum = PROC_PID_INIT_INO, #ifdef CONFIG_PID_NS .ns.ops = &pidns_operations, #endif }; EXPORT_SYMBOL_GPL(init_pid_ns); /* * Note: disable interrupts while the pidmap_lock is held as an * interrupt might come in and do read_lock(&tasklist_lock). * * If we don't disable interrupts there is a nasty deadlock between * detach_pid()->free_pid() and another cpu that does * spin_lock(&pidmap_lock) followed by an interrupt routine that does * read_lock(&tasklist_lock); * * After we clean up the tasklist_lock and know there are no * irq handlers that take it we can leave the interrupts enabled. * For now it is easier to be safe than to prove it can't happen. */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock); void put_pid(struct pid *pid) { struct pid_namespace *ns; if (!pid) return; ns = pid->numbers[pid->level].ns; if (refcount_dec_and_test(&pid->count)) { kmem_cache_free(ns->pid_cachep, pid); put_pid_ns(ns); } } EXPORT_SYMBOL_GPL(put_pid); static void delayed_put_pid(struct rcu_head *rhp) { struct pid *pid = container_of(rhp, struct pid, rcu); put_pid(pid); } void free_pid(struct pid *pid) { /* We can be called with write_lock_irq(&tasklist_lock) held */ int i; unsigned long flags; spin_lock_irqsave(&pidmap_lock, flags); for (i = 0; i <= pid->level; i++) { struct upid *upid = pid->numbers + i; struct pid_namespace *ns = upid->ns; switch (--ns->pid_allocated) { case 2: case 1: /* When all that is left in the pid namespace * is the reaper wake up the reaper. The reaper * may be sleeping in zap_pid_ns_processes(). */ wake_up_process(ns->child_reaper); break; case PIDNS_ADDING: /* Handle a fork failure of the first process */ WARN_ON(ns->child_reaper); ns->pid_allocated = 0; break; } idr_remove(&ns->idr, upid->nr); } spin_unlock_irqrestore(&pidmap_lock, flags); call_rcu(&pid->rcu, delayed_put_pid); } struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid, size_t set_tid_size) { struct pid *pid; enum pid_type type; int i, nr; struct pid_namespace *tmp; struct upid *upid; int retval = -ENOMEM; /* * set_tid_size contains the size of the set_tid array. Starting at * the most nested currently active PID namespace it tells alloc_pid() * which PID to set for a process in that most nested PID namespace * up to set_tid_size PID namespaces. It does not have to set the PID * for a process in all nested PID namespaces but set_tid_size must * never be greater than the current ns->level + 1. */ if (set_tid_size > ns->level + 1) return ERR_PTR(-EINVAL); pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL); if (!pid) return ERR_PTR(retval); tmp = ns; pid->level = ns->level; for (i = ns->level; i >= 0; i--) { int tid = 0; if (set_tid_size) { tid = set_tid[ns->level - i]; retval = -EINVAL; if (tid < 1 || tid >= pid_max) goto out_free; /* * Also fail if a PID != 1 is requested and * no PID 1 exists. */ if (tid != 1 && !tmp->child_reaper) goto out_free; retval = -EPERM; if (!checkpoint_restore_ns_capable(tmp->user_ns)) goto out_free; set_tid_size--; } idr_preload(GFP_KERNEL); spin_lock_irq(&pidmap_lock); if (tid) { nr = idr_alloc(&tmp->idr, NULL, tid, tid + 1, GFP_ATOMIC); /* * If ENOSPC is returned it means that the PID is * alreay in use. Return EEXIST in that case. */ if (nr == -ENOSPC) nr = -EEXIST; } else { int pid_min = 1; /* * init really needs pid 1, but after reaching the * maximum wrap back to RESERVED_PIDS */ if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS) pid_min = RESERVED_PIDS; /* * Store a null pointer so find_pid_ns does not find * a partially initialized PID (see below). */ nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min, pid_max, GFP_ATOMIC); } spin_unlock_irq(&pidmap_lock); idr_preload_end(); if (nr < 0) { retval = (nr == -ENOSPC) ? -EAGAIN : nr; goto out_free; } pid->numbers[i].nr = nr; pid->numbers[i].ns = tmp; tmp = tmp->parent; } /* * ENOMEM is not the most obvious choice especially for the case * where the child subreaper has already exited and the pid * namespace denies the creation of any new processes. But ENOMEM * is what we have exposed to userspace for a long time and it is * documented behavior for pid namespaces. So we can't easily * change it even if there were an error code better suited. */ retval = -ENOMEM; get_pid_ns(ns); refcount_set(&pid->count, 1); spin_lock_init(&pid->lock); for (type = 0; type < PIDTYPE_MAX; ++type) INIT_HLIST_HEAD(&pid->tasks[type]); init_waitqueue_head(&pid->wait_pidfd); INIT_HLIST_HEAD(&pid->inodes); upid = pid->numbers + ns->level; spin_lock_irq(&pidmap_lock); if (!(ns->pid_allocated & PIDNS_ADDING)) goto out_unlock; for ( ; upid >= pid->numbers; --upid) { /* Make the PID visible to find_pid_ns. */ idr_replace(&upid->ns->idr, pid, upid->nr); upid->ns->pid_allocated++; } spin_unlock_irq(&pidmap_lock); return pid; out_unlock: spin_unlock_irq(&pidmap_lock); put_pid_ns(ns); out_free: spin_lock_irq(&pidmap_lock); while (++i <= ns->level) { upid = pid->numbers + i; idr_remove(&upid->ns->idr, upid->nr); } /* On failure to allocate the first pid, reset the state */ if (ns->pid_allocated == PIDNS_ADDING) idr_set_cursor(&ns->idr, 0); spin_unlock_irq(&pidmap_lock); kmem_cache_free(ns->pid_cachep, pid); return ERR_PTR(retval); } void disable_pid_allocation(struct pid_namespace *ns) { spin_lock_irq(&pidmap_lock); ns->pid_allocated &= ~PIDNS_ADDING; spin_unlock_irq(&pidmap_lock); } struct pid *find_pid_ns(int nr, struct pid_namespace *ns) { return idr_find(&ns->idr, nr); } EXPORT_SYMBOL_GPL(find_pid_ns); struct pid *find_vpid(int nr) { return find_pid_ns(nr, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(find_vpid); static struct pid **task_pid_ptr(struct task_struct *task, enum pid_type type) { return (type == PIDTYPE_PID) ? &task->thread_pid : &task->signal->pids[type]; } /* * attach_pid() must be called with the tasklist_lock write-held. */ void attach_pid(struct task_struct *task, enum pid_type type) { struct pid *pid = *task_pid_ptr(task, type); hlist_add_head_rcu(&task->pid_links[type], &pid->tasks[type]); } static void __change_pid(struct task_struct *task, enum pid_type type, struct pid *new) { struct pid **pid_ptr = task_pid_ptr(task, type); struct pid *pid; int tmp; pid = *pid_ptr; hlist_del_rcu(&task->pid_links[type]); *pid_ptr = new; for (tmp = PIDTYPE_MAX; --tmp >= 0; ) if (pid_has_task(pid, tmp)) return; free_pid(pid); } void detach_pid(struct task_struct *task, enum pid_type type) { __change_pid(task, type, NULL); } void change_pid(struct task_struct *task, enum pid_type type, struct pid *pid) { __change_pid(task, type, pid); attach_pid(task, type); } void exchange_tids(struct task_struct *left, struct task_struct *right) { struct pid *pid1 = left->thread_pid; struct pid *pid2 = right->thread_pid; struct hlist_head *head1 = &pid1->tasks[PIDTYPE_PID]; struct hlist_head *head2 = &pid2->tasks[PIDTYPE_PID]; /* Swap the single entry tid lists */ hlists_swap_heads_rcu(head1, head2); /* Swap the per task_struct pid */ rcu_assign_pointer(left->thread_pid, pid2); rcu_assign_pointer(right->thread_pid, pid1); /* Swap the cached value */ WRITE_ONCE(left->pid, pid_nr(pid2)); WRITE_ONCE(right->pid, pid_nr(pid1)); } /* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */ void transfer_pid(struct task_struct *old, struct task_struct *new, enum pid_type type) { if (type == PIDTYPE_PID) new->thread_pid = old->thread_pid; hlist_replace_rcu(&old->pid_links[type], &new->pid_links[type]); } struct task_struct *pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result = NULL; if (pid) { struct hlist_node *first; first = rcu_dereference_check(hlist_first_rcu(&pid->tasks[type]), lockdep_tasklist_lock_is_held()); if (first) result = hlist_entry(first, struct task_struct, pid_links[(type)]); } return result; } EXPORT_SYMBOL(pid_task); /* * Must be called under rcu_read_lock(). */ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) { RCU_LOCKDEP_WARN(!rcu_read_lock_held(), "find_task_by_pid_ns() needs rcu_read_lock() protection"); return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); } struct task_struct *find_task_by_vpid(pid_t vnr) { return find_task_by_pid_ns(vnr, task_active_pid_ns(current)); } struct task_struct *find_get_task_by_vpid(pid_t nr) { struct task_struct *task; rcu_read_lock(); task = find_task_by_vpid(nr); if (task) get_task_struct(task); rcu_read_unlock(); return task; } struct pid *get_task_pid(struct task_struct *task, enum pid_type type) { struct pid *pid; rcu_read_lock(); pid = get_pid(rcu_dereference(*task_pid_ptr(task, type))); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(get_task_pid); struct task_struct *get_pid_task(struct pid *pid, enum pid_type type) { struct task_struct *result; rcu_read_lock(); result = pid_task(pid, type); if (result) get_task_struct(result); rcu_read_unlock(); return result; } EXPORT_SYMBOL_GPL(get_pid_task); struct pid *find_get_pid(pid_t nr) { struct pid *pid; rcu_read_lock(); pid = get_pid(find_vpid(nr)); rcu_read_unlock(); return pid; } EXPORT_SYMBOL_GPL(find_get_pid); pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) { struct upid *upid; pid_t nr = 0; if (pid && ns->level <= pid->level) { upid = &pid->numbers[ns->level]; if (upid->ns == ns) nr = upid->nr; } return nr; } EXPORT_SYMBOL_GPL(pid_nr_ns); pid_t pid_vnr(struct pid *pid) { return pid_nr_ns(pid, task_active_pid_ns(current)); } EXPORT_SYMBOL_GPL(pid_vnr); pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, struct pid_namespace *ns) { pid_t nr = 0; rcu_read_lock(); if (!ns) ns = task_active_pid_ns(current); nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); rcu_read_unlock(); return nr; } EXPORT_SYMBOL(__task_pid_nr_ns); struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); } EXPORT_SYMBOL_GPL(task_active_pid_ns); /* * Used by proc to find the first pid that is greater than or equal to nr. * * If there is a pid at nr this function is exactly the same as find_pid_ns. */ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) { return idr_get_next(&ns->idr, &nr); } struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags) { struct fd f; struct pid *pid; f = fdget(fd); if (!f.file) return ERR_PTR(-EBADF); pid = pidfd_pid(f.file); if (!IS_ERR(pid)) { get_pid(pid); *flags = f.file->f_flags; } fdput(f); return pid; } /** * pidfd_create() - Create a new pid file descriptor. * * @pid: struct pid that the pidfd will reference * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set. * * Note, that this function can only be called after the fd table has * been unshared to avoid leaking the pidfd to the new process. * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ static int pidfd_create(struct pid *pid, unsigned int flags) { int fd; fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), flags | O_RDWR | O_CLOEXEC); if (fd < 0) put_pid(pid); return fd; } /** * pidfd_open() - Open new pid file descriptor. * * @pid: pid for which to retrieve a pidfd * @flags: flags to pass * * This creates a new pid file descriptor with the O_CLOEXEC flag set for * the process identified by @pid. Currently, the process identified by * @pid must be a thread-group leader. This restriction currently exists * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot * be used with CLONE_THREAD) and pidfd polling (only supports thread group * leaders). * * Return: On success, a cloexec pidfd is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags) { int fd; struct pid *p; if (flags & ~PIDFD_NONBLOCK) return -EINVAL; if (pid <= 0) return -EINVAL; p = find_get_pid(pid); if (!p) return -ESRCH; if (pid_has_task(p, PIDTYPE_TGID)) fd = pidfd_create(p, flags); else fd = -EINVAL; put_pid(p); return fd; } void __init pid_idr_init(void) { /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_ADDING); /* bump default and minimum pid_max based on number of cpus */ pid_max = min(pid_max_max, max_t(int, pid_max, PIDS_PER_CPU_DEFAULT * num_possible_cpus())); pid_max_min = max_t(int, pid_max_min, PIDS_PER_CPU_MIN * num_possible_cpus()); pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); idr_init(&init_pid_ns.idr); init_pid_ns.pid_cachep = KMEM_CACHE(pid, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } static struct file *__pidfd_fget(struct task_struct *task, int fd) { struct file *file; int ret; ret = down_read_killable(&task->signal->exec_update_lock); if (ret) return ERR_PTR(ret); if (ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS)) file = fget_task(task, fd); else file = ERR_PTR(-EPERM); up_read(&task->signal->exec_update_lock); return file ?: ERR_PTR(-EBADF); } static int pidfd_getfd(struct pid *pid, int fd) { struct task_struct *task; struct file *file; int ret; task = get_pid_task(pid, PIDTYPE_PID); if (!task) return -ESRCH; file = __pidfd_fget(task, fd); put_task_struct(task); if (IS_ERR(file)) return PTR_ERR(file); ret = receive_fd(file, O_CLOEXEC); fput(file); return ret; } /** * sys_pidfd_getfd() - Get a file descriptor from another process * * @pidfd: the pidfd file descriptor of the process * @fd: the file descriptor number to get * @flags: flags on how to get the fd (reserved) * * This syscall gets a copy of a file descriptor from another process * based on the pidfd, and file descriptor number. It requires that * the calling process has the ability to ptrace the process represented * by the pidfd. The process which is having its file descriptor copied * is otherwise unaffected. * * Return: On success, a cloexec file descriptor is returned. * On error, a negative errno number will be returned. */ SYSCALL_DEFINE3(pidfd_getfd, int, pidfd, int, fd, unsigned int, flags) { struct pid *pid; struct fd f; int ret; /* flags is currently unused - make sure it's unset */ if (flags) return -EINVAL; f = fdget(pidfd); if (!f.file) return -EBADF; pid = pidfd_pid(f.file); if (IS_ERR(pid)) ret = PTR_ERR(pid); else ret = pidfd_getfd(pid, fd); fdput(f); return ret; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM timer #if !defined(_TRACE_TIMER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_TIMER_H #include <linux/tracepoint.h> #include <linux/hrtimer.h> #include <linux/timer.h> DECLARE_EVENT_CLASS(timer_class, TP_PROTO(struct timer_list *timer), TP_ARGS(timer), TP_STRUCT__entry( __field( void *, timer ) ), TP_fast_assign( __entry->timer = timer; ), TP_printk("timer=%p", __entry->timer) ); /** * timer_init - called when the timer is initialized * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_init, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); #define decode_timer_flags(flags) \ __print_flags(flags, "|", \ { TIMER_MIGRATING, "M" }, \ { TIMER_DEFERRABLE, "D" }, \ { TIMER_PINNED, "P" }, \ { TIMER_IRQSAFE, "I" }) /** * timer_start - called when the timer is started * @timer: pointer to struct timer_list * @expires: the timers expiry time */ TRACE_EVENT(timer_start, TP_PROTO(struct timer_list *timer, unsigned long expires, unsigned int flags), TP_ARGS(timer, expires, flags), TP_STRUCT__entry( __field( void *, timer ) __field( void *, function ) __field( unsigned long, expires ) __field( unsigned long, now ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->timer = timer; __entry->function = timer->function; __entry->expires = expires; __entry->now = jiffies; __entry->flags = flags; ), TP_printk("timer=%p function=%ps expires=%lu [timeout=%ld] cpu=%u idx=%u flags=%s", __entry->timer, __entry->function, __entry->expires, (long)__entry->expires - __entry->now, __entry->flags & TIMER_CPUMASK, __entry->flags >> TIMER_ARRAYSHIFT, decode_timer_flags(__entry->flags & TIMER_TRACE_FLAGMASK)) ); /** * timer_expire_entry - called immediately before the timer callback * @timer: pointer to struct timer_list * * Allows to determine the timer latency. */ TRACE_EVENT(timer_expire_entry, TP_PROTO(struct timer_list *timer, unsigned long baseclk), TP_ARGS(timer, baseclk), TP_STRUCT__entry( __field( void *, timer ) __field( unsigned long, now ) __field( void *, function) __field( unsigned long, baseclk ) ), TP_fast_assign( __entry->timer = timer; __entry->now = jiffies; __entry->function = timer->function; __entry->baseclk = baseclk; ), TP_printk("timer=%p function=%ps now=%lu baseclk=%lu", __entry->timer, __entry->function, __entry->now, __entry->baseclk) ); /** * timer_expire_exit - called immediately after the timer callback returns * @timer: pointer to struct timer_list * * When used in combination with the timer_expire_entry tracepoint we can * determine the runtime of the timer callback function. * * NOTE: Do NOT derefernce timer in TP_fast_assign. The pointer might * be invalid. We solely track the pointer. */ DEFINE_EVENT(timer_class, timer_expire_exit, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); /** * timer_cancel - called when the timer is canceled * @timer: pointer to struct timer_list */ DEFINE_EVENT(timer_class, timer_cancel, TP_PROTO(struct timer_list *timer), TP_ARGS(timer) ); #define decode_clockid(type) \ __print_symbolic(type, \ { CLOCK_REALTIME, "CLOCK_REALTIME" }, \ { CLOCK_MONOTONIC, "CLOCK_MONOTONIC" }, \ { CLOCK_BOOTTIME, "CLOCK_BOOTTIME" }, \ { CLOCK_TAI, "CLOCK_TAI" }) #define decode_hrtimer_mode(mode) \ __print_symbolic(mode, \ { HRTIMER_MODE_ABS, "ABS" }, \ { HRTIMER_MODE_REL, "REL" }, \ { HRTIMER_MODE_ABS_PINNED, "ABS|PINNED" }, \ { HRTIMER_MODE_REL_PINNED, "REL|PINNED" }, \ { HRTIMER_MODE_ABS_SOFT, "ABS|SOFT" }, \ { HRTIMER_MODE_REL_SOFT, "REL|SOFT" }, \ { HRTIMER_MODE_ABS_PINNED_SOFT, "ABS|PINNED|SOFT" }, \ { HRTIMER_MODE_REL_PINNED_SOFT, "REL|PINNED|SOFT" }) /** * hrtimer_init - called when the hrtimer is initialized * @hrtimer: pointer to struct hrtimer * @clockid: the hrtimers clock * @mode: the hrtimers mode */ TRACE_EVENT(hrtimer_init, TP_PROTO(struct hrtimer *hrtimer, clockid_t clockid, enum hrtimer_mode mode), TP_ARGS(hrtimer, clockid, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( clockid_t, clockid ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->clockid = clockid; __entry->mode = mode; ), TP_printk("hrtimer=%p clockid=%s mode=%s", __entry->hrtimer, decode_clockid(__entry->clockid), decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_start - called when the hrtimer is started * @hrtimer: pointer to struct hrtimer */ TRACE_EVENT(hrtimer_start, TP_PROTO(struct hrtimer *hrtimer, enum hrtimer_mode mode), TP_ARGS(hrtimer, mode), TP_STRUCT__entry( __field( void *, hrtimer ) __field( void *, function ) __field( s64, expires ) __field( s64, softexpires ) __field( enum hrtimer_mode, mode ) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->function = hrtimer->function; __entry->expires = hrtimer_get_expires(hrtimer); __entry->softexpires = hrtimer_get_softexpires(hrtimer); __entry->mode = mode; ), TP_printk("hrtimer=%p function=%ps expires=%llu softexpires=%llu " "mode=%s", __entry->hrtimer, __entry->function, (unsigned long long) __entry->expires, (unsigned long long) __entry->softexpires, decode_hrtimer_mode(__entry->mode)) ); /** * hrtimer_expire_entry - called immediately before the hrtimer callback * @hrtimer: pointer to struct hrtimer * @now: pointer to variable which contains current time of the * timers base. * * Allows to determine the timer latency. */ TRACE_EVENT(hrtimer_expire_entry, TP_PROTO(struct hrtimer *hrtimer, ktime_t *now), TP_ARGS(hrtimer, now), TP_STRUCT__entry( __field( void *, hrtimer ) __field( s64, now ) __field( void *, function) ), TP_fast_assign( __entry->hrtimer = hrtimer; __entry->now = *now; __entry->function = hrtimer->function; ), TP_printk("hrtimer=%p function=%ps now=%llu", __entry->hrtimer, __entry->function, (unsigned long long) __entry->now) ); DECLARE_EVENT_CLASS(hrtimer_class, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer), TP_STRUCT__entry( __field( void *, hrtimer ) ), TP_fast_assign( __entry->hrtimer = hrtimer; ), TP_printk("hrtimer=%p", __entry->hrtimer) ); /** * hrtimer_expire_exit - called immediately after the hrtimer callback returns * @hrtimer: pointer to struct hrtimer * * When used in combination with the hrtimer_expire_entry tracepoint we can * determine the runtime of the callback function. */ DEFINE_EVENT(hrtimer_class, hrtimer_expire_exit, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * hrtimer_cancel - called when the hrtimer is canceled * @hrtimer: pointer to struct hrtimer */ DEFINE_EVENT(hrtimer_class, hrtimer_cancel, TP_PROTO(struct hrtimer *hrtimer), TP_ARGS(hrtimer) ); /** * itimer_state - called when itimer is started or canceled * @which: name of the interval timer * @value: the itimers value, itimer is canceled if value->it_value is * zero, otherwise it is started * @expires: the itimers expiry time */ TRACE_EVENT(itimer_state, TP_PROTO(int which, const struct itimerspec64 *const value, unsigned long long expires), TP_ARGS(which, value, expires), TP_STRUCT__entry( __field( int, which ) __field( unsigned long long, expires ) __field( long, value_sec ) __field( long, value_nsec ) __field( long, interval_sec ) __field( long, interval_nsec ) ), TP_fast_assign( __entry->which = which; __entry->expires = expires; __entry->value_sec = value->it_value.tv_sec; __entry->value_nsec = value->it_value.tv_nsec; __entry->interval_sec = value->it_interval.tv_sec; __entry->interval_nsec = value->it_interval.tv_nsec; ), TP_printk("which=%d expires=%llu it_value=%ld.%06ld it_interval=%ld.%06ld", __entry->which, __entry->expires, __entry->value_sec, __entry->value_nsec / NSEC_PER_USEC, __entry->interval_sec, __entry->interval_nsec / NSEC_PER_USEC) ); /** * itimer_expire - called when itimer expires * @which: type of the interval timer * @pid: pid of the process which owns the timer * @now: current time, used to calculate the latency of itimer */ TRACE_EVENT(itimer_expire, TP_PROTO(int which, struct pid *pid, unsigned long long now), TP_ARGS(which, pid, now), TP_STRUCT__entry( __field( int , which ) __field( pid_t, pid ) __field( unsigned long long, now ) ), TP_fast_assign( __entry->which = which; __entry->now = now; __entry->pid = pid_nr(pid); ), TP_printk("which=%d pid=%d now=%llu", __entry->which, (int) __entry->pid, __entry->now) ); #ifdef CONFIG_NO_HZ_COMMON #define TICK_DEP_NAMES \ tick_dep_mask_name(NONE) \ tick_dep_name(POSIX_TIMER) \ tick_dep_name(PERF_EVENTS) \ tick_dep_name(SCHED) \ tick_dep_name(CLOCK_UNSTABLE) \ tick_dep_name_end(RCU) #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end /* The MASK will convert to their bits and they need to be processed too */ #define tick_dep_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); #define tick_dep_name_end(sdep) TRACE_DEFINE_ENUM(TICK_DEP_BIT_##sdep); \ TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); /* NONE only has a mask defined for it */ #define tick_dep_mask_name(sdep) TRACE_DEFINE_ENUM(TICK_DEP_MASK_##sdep); TICK_DEP_NAMES #undef tick_dep_name #undef tick_dep_mask_name #undef tick_dep_name_end #define tick_dep_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_mask_name(sdep) { TICK_DEP_MASK_##sdep, #sdep }, #define tick_dep_name_end(sdep) { TICK_DEP_MASK_##sdep, #sdep } #define show_tick_dep_name(val) \ __print_symbolic(val, TICK_DEP_NAMES) TRACE_EVENT(tick_stop, TP_PROTO(int success, int dependency), TP_ARGS(success, dependency), TP_STRUCT__entry( __field( int , success ) __field( int , dependency ) ), TP_fast_assign( __entry->success = success; __entry->dependency = dependency; ), TP_printk("success=%d dependency=%s", __entry->success, \ show_tick_dep_name(__entry->dependency)) ); #endif #endif /*