1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef LLIST_H #define LLIST_H /* * Lock-less NULL terminated single linked list * * Cases where locking is not needed: * If there are multiple producers and multiple consumers, llist_add can be * used in producers and llist_del_all can be used in consumers simultaneously * without locking. Also a single consumer can use llist_del_first while * multiple producers simultaneously use llist_add, without any locking. * * Cases where locking is needed: * If we have multiple consumers with llist_del_first used in one consumer, and * llist_del_first or llist_del_all used in other consumers, then a lock is * needed. This is because llist_del_first depends on list->first->next not * changing, but without lock protection, there's no way to be sure about that * if a preemption happens in the middle of the delete operation and on being * preempted back, the list->first is the same as before causing the cmpxchg in * llist_del_first to succeed. For example, while a llist_del_first operation * is in progress in one consumer, then a llist_del_first, llist_add, * llist_add (or llist_del_all, llist_add, llist_add) sequence in another * consumer may cause violations. * * This can be summarized as follows: * * | add | del_first | del_all * add | - | - | - * del_first | | L | L * del_all | | | - * * Where, a particular row's operation can happen concurrently with a column's * operation, with "-" being no lock needed, while "L" being lock is needed. * * The list entries deleted via llist_del_all can be traversed with * traversing function such as llist_for_each etc. But the list * entries can not be traversed safely before deleted from the list. * The order of deleted entries is from the newest to the oldest added * one. If you want to traverse from the oldest to the newest, you * must reverse the order by yourself before traversing. * * The basic atomic operation of this list is cmpxchg on long. On * architectures that don't have NMI-safe cmpxchg implementation, the * list can NOT be used in NMI handlers. So code that uses the list in * an NMI handler should depend on CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG. * * Copyright 2010,2011 Intel Corp. * Author: Huang Ying <ying.huang@intel.com> */ #include <linux/atomic.h> #include <linux/kernel.h> struct llist_head { struct llist_node *first; }; struct llist_node { struct llist_node *next; }; #define LLIST_HEAD_INIT(name) { NULL } #define LLIST_HEAD(name) struct llist_head name = LLIST_HEAD_INIT(name) /** * init_llist_head - initialize lock-less list head * @head: the head for your lock-less list */ static inline void init_llist_head(struct llist_head *list) { list->first = NULL; } /** * llist_entry - get the struct of this entry * @ptr: the &struct llist_node pointer. * @type: the type of the struct this is embedded in. * @member: the name of the llist_node within the struct. */ #define llist_entry(ptr, type, member) \ container_of(ptr, type, member) /** * member_address_is_nonnull - check whether the member address is not NULL * @ptr: the object pointer (struct type * that contains the llist_node) * @member: the name of the llist_node within the struct. * * This macro is conceptually the same as * &ptr->member != NULL * but it works around the fact that compilers can decide that taking a member * address is never a NULL pointer. * * Real objects that start at a high address and have a member at NULL are * unlikely to exist, but such pointers may be returned e.g. by the * container_of() macro. */ #define member_address_is_nonnull(ptr, member) \ ((uintptr_t)(ptr) + offsetof(typeof(*(ptr)), member) != 0) /** * llist_for_each - iterate over some deleted entries of a lock-less list * @pos: the &struct llist_node to use as a loop cursor * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each(pos, node) \ for ((pos) = (node); pos; (pos) = (pos)->next) /** * llist_for_each_safe - iterate over some deleted entries of a lock-less list * safe against removal of list entry * @pos: the &struct llist_node to use as a loop cursor * @n: another &struct llist_node to use as temporary storage * @node: the first entry of deleted list entries * * In general, some entries of the lock-less list can be traversed * safely only after being deleted from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_safe(pos, n, node) \ for ((pos) = (node); (pos) && ((n) = (pos)->next, true); (pos) = (n)) /** * llist_for_each_entry - iterate over some deleted entries of lock-less list of given type * @pos: the type * to use as a loop cursor. * @node: the fist entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry(pos, node, member) \ for ((pos) = llist_entry((node), typeof(*(pos)), member); \ member_address_is_nonnull(pos, member); \ (pos) = llist_entry((pos)->member.next, typeof(*(pos)), member)) /** * llist_for_each_entry_safe - iterate over some deleted entries of lock-less list of given type * safe against removal of list entry * @pos: the type * to use as a loop cursor. * @n: another type * to use as temporary storage * @node: the first entry of deleted list entries. * @member: the name of the llist_node with the struct. * * In general, some entries of the lock-less list can be traversed * safely only after being removed from list, so start with an entry * instead of list head. * * If being used on entries deleted from lock-less list directly, the * traverse order is from the newest to the oldest added entry. If * you want to traverse from the oldest to the newest, you must * reverse the order by yourself before traversing. */ #define llist_for_each_entry_safe(pos, n, node, member) \ for (pos = llist_entry((node), typeof(*pos), member); \ member_address_is_nonnull(pos, member) && \ (n = llist_entry(pos->member.next, typeof(*n), member), true); \ pos = n) /** * llist_empty - tests whether a lock-less list is empty * @head: the list to test * * Not guaranteed to be accurate or up to date. Just a quick way to * test whether the list is empty without deleting something from the * list. */ static inline bool llist_empty(const struct llist_head *head) { return READ_ONCE(head->first) == NULL; } static inline struct llist_node *llist_next(struct llist_node *node) { return node->next; } extern bool llist_add_batch(struct llist_node *new_first, struct llist_node *new_last, struct llist_head *head); /** * llist_add - add a new entry * @new: new entry to be added * @head: the head for your lock-less list * * Returns true if the list was empty prior to adding this entry. */ static inline bool llist_add(struct llist_node *new, struct llist_head *head) { return llist_add_batch(new, new, head); } /** * llist_del_all - delete all entries from lock-less list * @head: the head of lock-less list to delete all entries * * If list is empty, return NULL, otherwise, delete all entries and * return the pointer to the first entry. The order of entries * deleted is from the newest to the oldest added one. */ static inline struct llist_node *llist_del_all(struct llist_head *head) { return xchg(&head->first, NULL); } extern struct llist_node *llist_del_first(struct llist_head *head); struct llist_node *llist_reverse_order(struct llist_node *head); #endif /* LLIST_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLKDEV_H #define _LINUX_BLKDEV_H #include <linux/sched.h> #include <linux/sched/clock.h> #include <linux/major.h> #include <linux/genhd.h> #include <linux/list.h> #include <linux/llist.h> #include <linux/minmax.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/pagemap.h> #include <linux/backing-dev-defs.h> #include <linux/wait.h> #include <linux/mempool.h> #include <linux/pfn.h> #include <linux/bio.h> #include <linux/stringify.h> #include <linux/gfp.h> #include <linux/bsg.h> #include <linux/smp.h> #include <linux/rcupdate.h> #include <linux/percpu-refcount.h> #include <linux/scatterlist.h> #include <linux/blkzoned.h> #include <linux/pm.h> struct module; struct scsi_ioctl_command; struct request_queue; struct elevator_queue; struct blk_trace; struct request; struct sg_io_hdr; struct bsg_job; struct blkcg_gq; struct blk_flush_queue; struct pr_ops; struct rq_qos; struct blk_queue_stats; struct blk_stat_callback; struct blk_keyslot_manager; #define BLKDEV_MIN_RQ 4 #define BLKDEV_MAX_RQ 128 /* Default maximum */ /* Must be consistent with blk_mq_poll_stats_bkt() */ #define BLK_MQ_POLL_STATS_BKTS 16 /* Doing classic polling */ #define BLK_MQ_POLL_CLASSIC -1 /* * Maximum number of blkcg policies allowed to be registered concurrently. * Defined here to simplify include dependency. */ #define BLKCG_MAX_POLS 5 static inline int blk_validate_block_size(unsigned int bsize) { if (bsize < 512 || bsize > PAGE_SIZE || !is_power_of_2(bsize)) return -EINVAL; return 0; } typedef void (rq_end_io_fn)(struct request *, blk_status_t); /* * request flags */ typedef __u32 __bitwise req_flags_t; /* elevator knows about this request */ #define RQF_SORTED ((__force req_flags_t)(1 << 0)) /* drive already may have started this one */ #define RQF_STARTED ((__force req_flags_t)(1 << 1)) /* may not be passed by ioscheduler */ #define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3)) /* request for flush sequence */ #define RQF_FLUSH_SEQ ((__force req_flags_t)(1 << 4)) /* merge of different types, fail separately */ #define RQF_MIXED_MERGE ((__force req_flags_t)(1 << 5)) /* track inflight for MQ */ #define RQF_MQ_INFLIGHT ((__force req_flags_t)(1 << 6)) /* don't call prep for this one */ #define RQF_DONTPREP ((__force req_flags_t)(1 << 7)) /* vaguely specified driver internal error. Ignored by the block layer */ #define RQF_FAILED ((__force req_flags_t)(1 << 10)) /* don't warn about errors */ #define RQF_QUIET ((__force req_flags_t)(1 << 11)) /* elevator private data attached */ #define RQF_ELVPRIV ((__force req_flags_t)(1 << 12)) /* account into disk and partition IO statistics */ #define RQF_IO_STAT ((__force req_flags_t)(1 << 13)) /* request came from our alloc pool */ #define RQF_ALLOCED ((__force req_flags_t)(1 << 14)) /* runtime pm request */ #define RQF_PM ((__force req_flags_t)(1 << 15)) /* on IO scheduler merge hash */ #define RQF_HASHED ((__force req_flags_t)(1 << 16)) /* track IO completion time */ #define RQF_STATS ((__force req_flags_t)(1 << 17)) /* Look at ->special_vec for the actual data payload instead of the bio chain. */ #define RQF_SPECIAL_PAYLOAD ((__force req_flags_t)(1 << 18)) /* The per-zone write lock is held for this request */ #define RQF_ZONE_WRITE_LOCKED ((__force req_flags_t)(1 << 19)) /* already slept for hybrid poll */ #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) /* ->timeout has been called, don't expire again */ #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) /* flags that prevent us from merging requests: */ #define RQF_NOMERGE_FLAGS \ (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD) /* * Request state for blk-mq. */ enum mq_rq_state { MQ_RQ_IDLE = 0, MQ_RQ_IN_FLIGHT = 1, MQ_RQ_COMPLETE = 2, }; /* * Try to put the fields that are referenced together in the same cacheline. * * If you modify this structure, make sure to update blk_rq_init() and * especially blk_mq_rq_ctx_init() to take care of the added fields. */ struct request { struct request_queue *q; struct blk_mq_ctx *mq_ctx; struct blk_mq_hw_ctx *mq_hctx; unsigned int cmd_flags; /* op and common flags */ req_flags_t rq_flags; int tag; int internal_tag; /* the following two fields are internal, NEVER access directly */ unsigned int __data_len; /* total data len */ sector_t __sector; /* sector cursor */ struct bio *bio; struct bio *biotail; struct list_head queuelist; /* * The hash is used inside the scheduler, and killed once the * request reaches the dispatch list. The ipi_list is only used * to queue the request for softirq completion, which is long * after the request has been unhashed (and even removed from * the dispatch list). */ union { struct hlist_node hash; /* merge hash */ struct list_head ipi_list; }; /* * The rb_node is only used inside the io scheduler, requests * are pruned when moved to the dispatch queue. So let the * completion_data share space with the rb_node. */ union { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; void *completion_data; int error_count; /* for legacy drivers, don't use */ }; /* * Three pointers are available for the IO schedulers, if they need * more they have to dynamically allocate it. Flush requests are * never put on the IO scheduler. So let the flush fields share * space with the elevator data. */ union { struct { struct io_cq *icq; void *priv[2]; } elv; struct { unsigned int seq; struct list_head list; rq_end_io_fn *saved_end_io; } flush; }; struct gendisk *rq_disk; struct hd_struct *part; #ifdef CONFIG_BLK_RQ_ALLOC_TIME /* Time that the first bio started allocating this request. */ u64 alloc_time_ns; #endif /* Time that this request was allocated for this IO. */ u64 start_time_ns; /* Time that I/O was submitted to the device. */ u64 io_start_time_ns; #ifdef CONFIG_BLK_WBT unsigned short wbt_flags; #endif /* * rq sectors used for blk stats. It has the same value * with blk_rq_sectors(rq), except that it never be zeroed * by completion. */ unsigned short stats_sectors; /* * Number of scatter-gather DMA addr+len pairs after * physical address coalescing is performed. */ unsigned short nr_phys_segments; #if defined(CONFIG_BLK_DEV_INTEGRITY) unsigned short nr_integrity_segments; #endif #ifdef CONFIG_BLK_INLINE_ENCRYPTION struct bio_crypt_ctx *crypt_ctx; struct blk_ksm_keyslot *crypt_keyslot; #endif unsigned short write_hint; unsigned short ioprio; enum mq_rq_state state; refcount_t ref; unsigned int timeout; unsigned long deadline; union { struct __call_single_data csd; u64 fifo_time; }; /* * completion callback. */ rq_end_io_fn *end_io; void *end_io_data; }; static inline bool blk_op_is_scsi(unsigned int op) { return op == REQ_OP_SCSI_IN || op == REQ_OP_SCSI_OUT; } static inline bool blk_op_is_private(unsigned int op) { return op == REQ_OP_DRV_IN || op == REQ_OP_DRV_OUT; } static inline bool blk_rq_is_scsi(struct request *rq) { return blk_op_is_scsi(req_op(rq)); } static inline bool blk_rq_is_private(struct request *rq) { return blk_op_is_private(req_op(rq)); } static inline bool blk_rq_is_passthrough(struct request *rq) { return blk_rq_is_scsi(rq) || blk_rq_is_private(rq); } static inline bool bio_is_passthrough(struct bio *bio) { unsigned op = bio_op(bio); return blk_op_is_scsi(op) || blk_op_is_private(op); } static inline unsigned short req_get_ioprio(struct request *req) { return req->ioprio; } #include <linux/elevator.h> struct blk_queue_ctx; struct bio_vec; enum blk_eh_timer_return { BLK_EH_DONE, /* drivers has completed the command */ BLK_EH_RESET_TIMER, /* reset timer and try again */ }; enum blk_queue_state { Queue_down, Queue_up, }; #define BLK_TAG_ALLOC_FIFO 0 /* allocate starting from 0 */ #define BLK_TAG_ALLOC_RR 1 /* allocate starting from last allocated tag */ #define BLK_SCSI_MAX_CMDS (256) #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) /* * Zoned block device models (zoned limit). * * Note: This needs to be ordered from the least to the most severe * restrictions for the inheritance in blk_stack_limits() to work. */ enum blk_zoned_model { BLK_ZONED_NONE = 0, /* Regular block device */ BLK_ZONED_HA, /* Host-aware zoned block device */ BLK_ZONED_HM, /* Host-managed zoned block device */ }; struct queue_limits { unsigned long bounce_pfn; unsigned long seg_boundary_mask; unsigned long virt_boundary_mask; unsigned int max_hw_sectors; unsigned int max_dev_sectors; unsigned int chunk_sectors; unsigned int max_sectors; unsigned int max_segment_size; unsigned int physical_block_size; unsigned int logical_block_size; unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; unsigned int max_write_same_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; unsigned int discard_granularity; unsigned int discard_alignment; unsigned short max_segments; unsigned short max_integrity_segments; unsigned short max_discard_segments; unsigned char misaligned; unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; enum blk_zoned_model zoned; }; typedef int (*report_zones_cb)(struct blk_zone *zone, unsigned int idx, void *data); void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model); #ifdef CONFIG_BLK_DEV_ZONED #define BLK_ALL_ZONES ((unsigned int)-1) int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); unsigned int blkdev_nr_zones(struct gendisk *disk); extern int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)); extern int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); extern int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blkdev_nr_zones(struct gendisk *disk) { return 0; } static inline int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; } static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { return -ENOTTY; } #endif /* CONFIG_BLK_DEV_ZONED */ struct request_queue { struct request *last_merge; struct elevator_queue *elevator; struct percpu_ref q_usage_counter; struct blk_queue_stats *stats; struct rq_qos *rq_qos; const struct blk_mq_ops *mq_ops; /* sw queues */ struct blk_mq_ctx __percpu *queue_ctx; unsigned int queue_depth; /* hw dispatch queues */ struct blk_mq_hw_ctx **queue_hw_ctx; unsigned int nr_hw_queues; struct backing_dev_info *backing_dev_info; /* * The queue owner gets to use this for whatever they like. * ll_rw_blk doesn't touch it. */ void *queuedata; /* * various queue flags, see QUEUE_* below */ unsigned long queue_flags; /* * Number of contexts that have called blk_set_pm_only(). If this * counter is above zero then only RQF_PM requests are processed. */ atomic_t pm_only; /* * ida allocated id for this queue. Used to index queues from * ioctx. */ int id; /* * queue needs bounce pages for pages above this limit */ gfp_t bounce_gfp; spinlock_t queue_lock; /* * queue kobject */ struct kobject kobj; /* * mq queue kobject */ struct kobject *mq_kobj; #ifdef CONFIG_BLK_DEV_INTEGRITY struct blk_integrity integrity; #endif /* CONFIG_BLK_DEV_INTEGRITY */ #ifdef CONFIG_PM struct device *dev; enum rpm_status rpm_status; unsigned int nr_pending; #endif /* * queue settings */ unsigned long nr_requests; /* Max # of requests */ unsigned int dma_pad_mask; unsigned int dma_alignment; #ifdef CONFIG_BLK_INLINE_ENCRYPTION /* Inline crypto capabilities */ struct blk_keyslot_manager *ksm; #endif unsigned int rq_timeout; int poll_nsec; struct blk_stat_callback *poll_cb; struct blk_rq_stat poll_stat[BLK_MQ_POLL_STATS_BKTS]; struct timer_list timeout; struct work_struct timeout_work; atomic_t nr_active_requests_shared_sbitmap; struct list_head icq_list; #ifdef CONFIG_BLK_CGROUP DECLARE_BITMAP (blkcg_pols, BLKCG_MAX_POLS); struct blkcg_gq *root_blkg; struct list_head blkg_list; #endif struct queue_limits limits; unsigned int required_elevator_features; #ifdef CONFIG_BLK_DEV_ZONED /* * Zoned block device information for request dispatch control. * nr_zones is the total number of zones of the device. This is always * 0 for regular block devices. conv_zones_bitmap is a bitmap of nr_zones * bits which indicates if a zone is conventional (bit set) or * sequential (bit clear). seq_zones_wlock is a bitmap of nr_zones * bits which indicates if a zone is write locked, that is, if a write * request targeting the zone was dispatched. All three fields are * initialized by the low level device driver (e.g. scsi/sd.c). * Stacking drivers (device mappers) may or may not initialize * these fields. * * Reads of this information must be protected with blk_queue_enter() / * blk_queue_exit(). Modifying this information is only allowed while * no requests are being processed. See also blk_mq_freeze_queue() and * blk_mq_unfreeze_queue(). */ unsigned int nr_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int max_open_zones; unsigned int max_active_zones; #endif /* CONFIG_BLK_DEV_ZONED */ /* * sg stuff */ unsigned int sg_timeout; unsigned int sg_reserved_size; int node; struct mutex debugfs_mutex; #ifdef CONFIG_BLK_DEV_IO_TRACE struct blk_trace __rcu *blk_trace; #endif /* * for flush operations */ struct blk_flush_queue *fq; struct list_head requeue_list; spinlock_t requeue_lock; struct delayed_work requeue_work; struct mutex sysfs_lock; struct mutex sysfs_dir_lock; /* * for reusing dead hctx instance in case of updating * nr_hw_queues */ struct list_head unused_hctx_list; spinlock_t unused_hctx_lock; int mq_freeze_depth; #if defined(CONFIG_BLK_DEV_BSG) struct bsg_class_device bsg_dev; #endif #ifdef CONFIG_BLK_DEV_THROTTLING /* Throttle data */ struct throtl_data *td; #endif struct rcu_head rcu_head; wait_queue_head_t mq_freeze_wq; /* * Protect concurrent access to q_usage_counter by * percpu_ref_kill() and percpu_ref_reinit(). */ struct mutex mq_freeze_lock; struct blk_mq_tag_set *tag_set; struct list_head tag_set_list; struct bio_set bio_split; struct dentry *debugfs_dir; #ifdef CONFIG_BLK_DEBUG_FS struct dentry *sched_debugfs_dir; struct dentry *rqos_debugfs_dir; #endif bool mq_sysfs_init_done; size_t cmd_size; #define BLK_MAX_WRITE_HINTS 5 u64 write_hints[BLK_MAX_WRITE_HINTS]; }; /* Keep blk_queue_flag_name[] in sync with the definitions below */ #define QUEUE_FLAG_STOPPED 0 /* queue is stopped */ #define QUEUE_FLAG_DYING 1 /* queue being torn down */ #define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */ #define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */ #define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */ #define QUEUE_FLAG_NONROT 6 /* non-rotational device (SSD) */ #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 7 /* do disk/partitions IO accounting */ #define QUEUE_FLAG_DISCARD 8 /* supports DISCARD */ #define QUEUE_FLAG_NOXMERGES 9 /* No extended merges */ #define QUEUE_FLAG_ADD_RANDOM 10 /* Contributes to random pool */ #define QUEUE_FLAG_SECERASE 11 /* supports secure erase */ #define QUEUE_FLAG_SAME_FORCE 12 /* force complete on same CPU */ #define QUEUE_FLAG_DEAD 13 /* queue tear-down finished */ #define QUEUE_FLAG_INIT_DONE 14 /* queue is initialized */ #define QUEUE_FLAG_STABLE_WRITES 15 /* don't modify blks until WB is done */ #define QUEUE_FLAG_POLL 16 /* IO polling enabled if set */ #define QUEUE_FLAG_WC 17 /* Write back caching */ #define QUEUE_FLAG_FUA 18 /* device supports FUA writes */ #define QUEUE_FLAG_DAX 19 /* device supports DAX */ #define QUEUE_FLAG_STATS 20 /* track IO start and completion times */ #define QUEUE_FLAG_POLL_STATS 21 /* collecting stats for hybrid polling */ #define QUEUE_FLAG_REGISTERED 22 /* queue has been registered to a disk */ #define QUEUE_FLAG_SCSI_PASSTHROUGH 23 /* queue supports SCSI commands */ #define QUEUE_FLAG_QUIESCED 24 /* queue has been quiesced */ #define QUEUE_FLAG_PCI_P2PDMA 25 /* device supports PCI p2p requests */ #define QUEUE_FLAG_ZONE_RESETALL 26 /* supports Zone Reset All */ #define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */ #define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */ #define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */ #define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_SAME_COMP) | \ (1 << QUEUE_FLAG_NOWAIT)) void blk_queue_flag_set(unsigned int flag, struct request_queue *q); void blk_queue_flag_clear(unsigned int flag, struct request_queue *q); bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q); #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) #define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags) #define blk_queue_dead(q) test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags) #define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags) #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) #define blk_queue_noxmerges(q) \ test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) #define blk_queue_stable_writes(q) \ test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags) #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_queue_zone_resetall(q) \ test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags) #define blk_queue_secure_erase(q) \ (test_bit(QUEUE_FLAG_SECERASE, &(q)->queue_flags)) #define blk_queue_dax(q) test_bit(QUEUE_FLAG_DAX, &(q)->queue_flags) #define blk_queue_scsi_passthrough(q) \ test_bit(QUEUE_FLAG_SCSI_PASSTHROUGH, &(q)->queue_flags) #define blk_queue_pci_p2pdma(q) \ test_bit(QUEUE_FLAG_PCI_P2PDMA, &(q)->queue_flags) #ifdef CONFIG_BLK_RQ_ALLOC_TIME #define blk_queue_rq_alloc_time(q) \ test_bit(QUEUE_FLAG_RQ_ALLOC_TIME, &(q)->queue_flags) #else #define blk_queue_rq_alloc_time(q) false #endif #define blk_noretry_request(rq) \ ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ REQ_FAILFAST_DRIVER)) #define blk_queue_quiesced(q) test_bit(QUEUE_FLAG_QUIESCED, &(q)->queue_flags) #define blk_queue_pm_only(q) atomic_read(&(q)->pm_only) #define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags) #define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags) #define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags) extern void blk_set_pm_only(struct request_queue *q); extern void blk_clear_pm_only(struct request_queue *q); static inline bool blk_account_rq(struct request *rq) { return (rq->rq_flags & RQF_STARTED) && !blk_rq_is_passthrough(rq); } #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) #define rq_data_dir(rq) (op_is_write(req_op(rq)) ? WRITE : READ) #define rq_dma_dir(rq) \ (op_is_write(req_op(rq)) ? DMA_TO_DEVICE : DMA_FROM_DEVICE) #define dma_map_bvec(dev, bv, dir, attrs) \ dma_map_page_attrs(dev, (bv)->bv_page, (bv)->bv_offset, (bv)->bv_len, \ (dir), (attrs)) static inline bool queue_is_mq(struct request_queue *q) { return q->mq_ops; } #ifdef CONFIG_PM static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return q->rpm_status; } #else static inline enum rpm_status queue_rpm_status(struct request_queue *q) { return RPM_ACTIVE; } #endif static inline enum blk_zoned_model blk_queue_zoned_model(struct request_queue *q) { if (IS_ENABLED(CONFIG_BLK_DEV_ZONED)) return q->limits.zoned; return BLK_ZONED_NONE; } static inline bool blk_queue_is_zoned(struct request_queue *q) { switch (blk_queue_zoned_model(q)) { case BLK_ZONED_HA: case BLK_ZONED_HM: return true; default: return false; } } static inline sector_t blk_queue_zone_sectors(struct request_queue *q) { return blk_queue_is_zoned(q) ? q->limits.chunk_sectors : 0; } #ifdef CONFIG_BLK_DEV_ZONED static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { return blk_queue_is_zoned(q) ? q->nr_zones : 0; } static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { if (!blk_queue_is_zoned(q)) return 0; return sector >> ilog2(q->limits.chunk_sectors); } static inline bool blk_queue_zone_is_seq(struct request_queue *q, sector_t sector) { if (!blk_queue_is_zoned(q)) return false; if (!q->conv_zones_bitmap) return true; return !test_bit(blk_queue_zone_no(q, sector), q->conv_zones_bitmap); } static inline void blk_queue_max_open_zones(struct request_queue *q, unsigned int max_open_zones) { q->max_open_zones = max_open_zones; } static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return q->max_open_zones; } static inline void blk_queue_max_active_zones(struct request_queue *q, unsigned int max_active_zones) { q->max_active_zones = max_active_zones; } static inline unsigned int queue_max_active_zones(const struct request_queue *q) { return q->max_active_zones; } #else /* CONFIG_BLK_DEV_ZONED */ static inline unsigned int blk_queue_nr_zones(struct request_queue *q) { return 0; } static inline bool blk_queue_zone_is_seq(struct request_queue *q, sector_t sector) { return false; } static inline unsigned int blk_queue_zone_no(struct request_queue *q, sector_t sector) { return 0; } static inline unsigned int queue_max_open_zones(const struct request_queue *q) { return 0; } static inline unsigned int queue_max_active_zones(const struct request_queue *q) { return 0; } #endif /* CONFIG_BLK_DEV_ZONED */ static inline bool rq_is_sync(struct request *rq) { return op_is_sync(rq->cmd_flags); } static inline bool rq_mergeable(struct request *rq) { if (blk_rq_is_passthrough(rq)) return false; if (req_op(rq) == REQ_OP_FLUSH) return false; if (req_op(rq) == REQ_OP_WRITE_ZEROES) return false; if (req_op(rq) == REQ_OP_ZONE_APPEND) return false; if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; if (rq->rq_flags & RQF_NOMERGE_FLAGS) return false; return true; } static inline bool blk_write_same_mergeable(struct bio *a, struct bio *b) { if (bio_page(a) == bio_page(b) && bio_offset(a) == bio_offset(b)) return true; return false; } static inline unsigned int blk_queue_depth(struct request_queue *q) { if (q->queue_depth) return q->queue_depth; return q->nr_requests; } extern unsigned long blk_max_low_pfn, blk_max_pfn; /* * standard bounce addresses: * * BLK_BOUNCE_HIGH : bounce all highmem pages * BLK_BOUNCE_ANY : don't bounce anything * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary */ #if BITS_PER_LONG == 32 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT) #else #define BLK_BOUNCE_HIGH -1ULL #endif #define BLK_BOUNCE_ANY (-1ULL) #define BLK_BOUNCE_ISA (DMA_BIT_MASK(24)) /* * default timeout for SG_IO if none specified */ #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) #define BLK_MIN_SG_TIMEOUT (7 * HZ) struct rq_map_data { struct page **pages; int page_order; int nr_entries; unsigned long offset; int null_mapped; int from_user; }; struct req_iterator { struct bvec_iter iter; struct bio *bio; }; /* This should not be used directly - use rq_for_each_segment */ #define for_each_bio(_bio) \ for (; _bio; _bio = _bio->bi_next) #define __rq_for_each_bio(_bio, rq) \ if ((rq->bio)) \ for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) #define rq_for_each_segment(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_segment(bvl, _iter.bio, _iter.iter) #define rq_for_each_bvec(bvl, _rq, _iter) \ __rq_for_each_bio(_iter.bio, _rq) \ bio_for_each_bvec(bvl, _iter.bio, _iter.iter) #define rq_iter_last(bvec, _iter) \ (_iter.bio->bi_next == NULL && \ bio_iter_last(bvec, _iter.iter)) #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" #endif #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE extern void rq_flush_dcache_pages(struct request *rq); #else static inline void rq_flush_dcache_pages(struct request *rq) { } #endif extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); blk_qc_t submit_bio_noacct(struct bio *bio); extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern struct request *blk_get_request(struct request_queue *, unsigned int op, blk_mq_req_flags_t flags); extern int blk_lld_busy(struct request_queue *q); extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data); extern void blk_rq_unprep_clone(struct request *rq); extern blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq); extern int blk_rq_append_bio(struct request *rq, struct bio **bio); extern void blk_queue_split(struct bio **); extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int); extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t, unsigned int, void __user *); extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, unsigned int, void __user *); extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); extern int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp); extern int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp); extern int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags); extern void blk_queue_exit(struct request_queue *q); extern void blk_sync_queue(struct request_queue *q); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); extern int blk_rq_unmap_user(struct bio *); extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); extern int blk_rq_map_user_iov(struct request_queue *, struct request *, struct rq_map_data *, const struct iov_iter *, gfp_t); extern void blk_execute_rq(struct request_queue *, struct gendisk *, struct request *, int); extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, struct request *, int, rq_end_io_fn *); /* Helper to convert REQ_OP_XXX to its string format XXX */ extern const char *blk_op_str(unsigned int op); int blk_status_to_errno(blk_status_t status); blk_status_t errno_to_blk_status(int errno); int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin); static inline struct request_queue *bdev_get_queue(struct block_device *bdev) { return bdev->bd_disk->queue; /* this is never NULL */ } /* * The basic unit of block I/O is a sector. It is used in a number of contexts * in Linux (blk, bio, genhd). The size of one sector is 512 = 2**9 * bytes. Variables of type sector_t represent an offset or size that is a * multiple of 512 bytes. Hence these two constants. */ #ifndef SECTOR_SHIFT #define SECTOR_SHIFT 9 #endif #ifndef SECTOR_SIZE #define SECTOR_SIZE (1 << SECTOR_SHIFT) #endif /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request * blk_rq_cur_bytes() : bytes left in the current segment * blk_rq_err_bytes() : bytes left till the next error boundary * blk_rq_sectors() : sectors left in the entire request * blk_rq_cur_sectors() : sectors left in the current segment * blk_rq_stats_sectors() : sectors of the entire request used for stats */ static inline sector_t blk_rq_pos(const struct request *rq) { return rq->__sector; } static inline unsigned int blk_rq_bytes(const struct request *rq) { return rq->__data_len; } static inline int blk_rq_cur_bytes(const struct request *rq) { return rq->bio ? bio_cur_bytes(rq->bio) : 0; } extern unsigned int blk_rq_err_bytes(const struct request *rq); static inline unsigned int blk_rq_sectors(const struct request *rq) { return blk_rq_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_cur_sectors(const struct request *rq) { return blk_rq_cur_bytes(rq) >> SECTOR_SHIFT; } static inline unsigned int blk_rq_stats_sectors(const struct request *rq) { return rq->stats_sectors; } #ifdef CONFIG_BLK_DEV_ZONED /* Helper to convert BLK_ZONE_ZONE_XXX to its string format XXX */ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond); static inline unsigned int blk_rq_zone_no(struct request *rq) { return blk_queue_zone_no(rq->q, blk_rq_pos(rq)); } static inline unsigned int blk_rq_zone_is_seq(struct request *rq) { return blk_queue_zone_is_seq(rq->q, blk_rq_pos(rq)); } #endif /* CONFIG_BLK_DEV_ZONED */ /* * Some commands like WRITE SAME have a payload or data transfer size which * is different from the size of the request. Any driver that supports such * commands using the RQF_SPECIAL_PAYLOAD flag needs to use this helper to * calculate the data transfer size. */ static inline unsigned int blk_rq_payload_bytes(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec.bv_len; return blk_rq_bytes(rq); } /* * Return the first full biovec in the request. The caller needs to check that * there are any bvecs before calling this helper. */ static inline struct bio_vec req_bvec(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return rq->special_vec; return mp_bvec_iter_bvec(rq->bio->bi_io_vec, rq->bio->bi_iter); } static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, int op) { if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) return min(q->limits.max_discard_sectors, UINT_MAX >> SECTOR_SHIFT); if (unlikely(op == REQ_OP_WRITE_SAME)) return q->limits.max_write_same_sectors; if (unlikely(op == REQ_OP_WRITE_ZEROES)) return q->limits.max_write_zeroes_sectors; return q->limits.max_sectors; } /* * Return maximum size of a request at given offset. Only valid for * file system requests. */ static inline unsigned int blk_max_size_offset(struct request_queue *q, sector_t offset, unsigned int chunk_sectors) { if (!chunk_sectors) { if (q->limits.chunk_sectors) chunk_sectors = q->limits.chunk_sectors; else return q->limits.max_sectors; } if (likely(is_power_of_2(chunk_sectors))) chunk_sectors -= offset & (chunk_sectors - 1); else chunk_sectors -= sector_div(offset, chunk_sectors); return min(q->limits.max_sectors, chunk_sectors); } static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { struct request_queue *q = rq->q; if (blk_rq_is_passthrough(rq)) return q->limits.max_hw_sectors; if (!q->limits.chunk_sectors || req_op(rq) == REQ_OP_DISCARD || req_op(rq) == REQ_OP_SECURE_ERASE) return blk_queue_get_max_sectors(q, req_op(rq)); return min(blk_max_size_offset(q, offset, 0), blk_queue_get_max_sectors(q, req_op(rq))); } static inline unsigned int blk_rq_count_bios(struct request *rq) { unsigned int nr_bios = 0; struct bio *bio; __rq_for_each_bio(bio, rq) nr_bios++; return nr_bios; } void blk_steal_bios(struct bio_list *list, struct request *rq); /* * Request completion related functions. * * blk_update_request() completes given number of bytes and updates * the request without completing it. */ extern bool blk_update_request(struct request *rq, blk_status_t error, unsigned int nr_bytes); extern void blk_abort_request(struct request *); /* * Access functions for manipulating queue properties */ extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_chunk_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_segments(struct request_queue *, unsigned short); extern void blk_queue_max_discard_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors); extern void blk_queue_max_write_same_sectors(struct request_queue *q, unsigned int max_write_same_sectors); extern void blk_queue_max_write_zeroes_sectors(struct request_queue *q, unsigned int max_write_same_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned int); extern void blk_queue_max_zone_append_sectors(struct request_queue *q, unsigned int max_zone_append_sectors); extern void blk_queue_physical_block_size(struct request_queue *, unsigned int); extern void blk_queue_alignment_offset(struct request_queue *q, unsigned int alignment); void blk_queue_update_readahead(struct request_queue *q); extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); extern void blk_queue_io_min(struct request_queue *q, unsigned int min); extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_default_limits(struct queue_limits *lim); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); extern void blk_queue_dma_alignment(struct request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_write_cache(struct request_queue *q, bool enabled, bool fua); extern void blk_queue_required_elevator_features(struct request_queue *q, unsigned int features); extern bool blk_queue_can_use_dma_map_merging(struct request_queue *q, struct device *dev); /* * Number of physical segments as sent to the device. * * Normally this is the number of discontiguous data segments sent by the * submitter. But for data-less command like discard we might have no * actual data segments submitted, but the driver might have to add it's * own special payload. In that case we still return 1 here so that this * special payload will be mapped. */ static inline unsigned short blk_rq_nr_phys_segments(struct request *rq) { if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) return 1; return rq->nr_phys_segments; } /* * Number of discard segments (or ranges) the driver needs to fill in. * Each discard bio merged into a request is counted as one segment. */ static inline unsigned short blk_rq_nr_discard_segments(struct request *rq) { return max_t(unsigned short, rq->nr_phys_segments, 1); } int __blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist, struct scatterlist **last_sg); static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq, struct scatterlist *sglist) { struct scatterlist *last_sg = NULL; return __blk_rq_map_sg(q, rq, sglist, &last_sg); } extern void blk_dump_rq_flags(struct request *, char *); bool __must_check blk_get_queue(struct request_queue *); struct request_queue *blk_alloc_queue(int node_id); extern void blk_put_queue(struct request_queue *); extern void blk_set_queue_dying(struct request_queue *); #ifdef CONFIG_BLOCK /* * blk_plug permits building a queue of related requests by holding the I/O * fragments for a short period. This allows merging of sequential requests * into single larger request. As the requests are moved from a per-task list to * the device's request_queue in a batch, this results in improved scalability * as the lock contention for request_queue lock is reduced. * * It is ok not to disable preemption when adding the request to the plug list * or when attempting a merge, because blk_schedule_flush_list() will only flush * the plug list when the task sleeps by itself. For details, please see * schedule() where blk_schedule_flush_plug() is called. */ struct blk_plug { struct list_head mq_list; /* blk-mq requests */ struct list_head cb_list; /* md requires an unplug callback */ unsigned short rq_count; bool multiple_queues; bool nowait; }; struct blk_plug_cb; typedef void (*blk_plug_cb_fn)(struct blk_plug_cb *, bool); struct blk_plug_cb { struct list_head list; blk_plug_cb_fn callback; void *data; }; extern struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size); extern void blk_start_plug(struct blk_plug *); extern void blk_finish_plug(struct blk_plug *); extern void blk_flush_plug_list(struct blk_plug *, bool); static inline void blk_flush_plug(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; if (plug) blk_flush_plug_list(plug, false); } static inline void blk_schedule_flush_plug(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; if (plug) blk_flush_plug_list(plug, true); } static inline bool blk_needs_flush_plug(struct task_struct *tsk) { struct blk_plug *plug = tsk->plug; return plug && (!list_empty(&plug->mq_list) || !list_empty(&plug->cb_list)); } int blkdev_issue_flush(struct block_device *, gfp_t); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ struct blk_plug { }; static inline void blk_start_plug(struct blk_plug *plug) { } static inline void blk_finish_plug(struct blk_plug *plug) { } static inline void blk_flush_plug(struct task_struct *task) { } static inline void blk_schedule_flush_plug(struct task_struct *task) { } static inline bool blk_needs_flush_plug(struct task_struct *tsk) { return false; } static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) { return 0; } static inline long nr_blockdev_pages(void) { return 0; } #endif /* CONFIG_BLOCK */ extern void blk_io_schedule(void); extern int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct page *page); #define BLKDEV_DISCARD_SECURE (1 << 0) /* issue a secure erase */ extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, int flags, struct bio **biop); #define BLKDEV_ZERO_NOUNMAP (1 << 0) /* do not free blocks */ #define BLKDEV_ZERO_NOFALLBACK (1 << 1) /* don't write explicit zeroes */ extern int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags); extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, unsigned flags); static inline int sb_issue_discard(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags) { return blkdev_issue_discard(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask, flags); } static inline int sb_issue_zeroout(struct super_block *sb, sector_t block, sector_t nr_blocks, gfp_t gfp_mask) { return blkdev_issue_zeroout(sb->s_bdev, block << (sb->s_blocksize_bits - SECTOR_SHIFT), nr_blocks << (sb->s_blocksize_bits - SECTOR_SHIFT), gfp_mask, 0); } extern int blk_verify_command(unsigned char *cmd, fmode_t mode); static inline bool bdev_is_partition(struct block_device *bdev) { return bdev->bd_partno; } enum blk_default_limits { BLK_MAX_SEGMENTS = 128, BLK_SAFE_MAX_SECTORS = 255, BLK_DEF_MAX_SECTORS = 2560, BLK_MAX_SEGMENT_SIZE = 65536, BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, }; static inline unsigned long queue_segment_boundary(const struct request_queue *q) { return q->limits.seg_boundary_mask; } static inline unsigned long queue_virt_boundary(const struct request_queue *q) { return q->limits.virt_boundary_mask; } static inline unsigned int queue_max_sectors(const struct request_queue *q) { return q->limits.max_sectors; } static inline unsigned int queue_max_hw_sectors(const struct request_queue *q) { return q->limits.max_hw_sectors; } static inline unsigned short queue_max_segments(const struct request_queue *q) { return q->limits.max_segments; } static inline unsigned short queue_max_discard_segments(const struct request_queue *q) { return q->limits.max_discard_segments; } static inline unsigned int queue_max_segment_size(const struct request_queue *q) { return q->limits.max_segment_size; } static inline unsigned int queue_max_zone_append_sectors(const struct request_queue *q) { const struct queue_limits *l = &q->limits; return min(l->max_zone_append_sectors, l->max_sectors); } static inline unsigned queue_logical_block_size(const struct request_queue *q) { int retval = 512; if (q && q->limits.logical_block_size) retval = q->limits.logical_block_size; return retval; } static inline unsigned int bdev_logical_block_size(struct block_device *bdev) { return queue_logical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_physical_block_size(const struct request_queue *q) { return q->limits.physical_block_size; } static inline unsigned int bdev_physical_block_size(struct block_device *bdev) { return queue_physical_block_size(bdev_get_queue(bdev)); } static inline unsigned int queue_io_min(const struct request_queue *q) { return q->limits.io_min; } static inline int bdev_io_min(struct block_device *bdev) { return queue_io_min(bdev_get_queue(bdev)); } static inline unsigned int queue_io_opt(const struct request_queue *q) { return q->limits.io_opt; } static inline int bdev_io_opt(struct block_device *bdev) { return queue_io_opt(bdev_get_queue(bdev)); } static inline int queue_alignment_offset(const struct request_queue *q) { if (q->limits.misaligned) return -1; return q->limits.alignment_offset; } static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) { unsigned int granularity = max(lim->physical_block_size, lim->io_min); unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) << SECTOR_SHIFT; return (granularity + lim->alignment_offset - alignment) % granularity; } static inline int bdev_alignment_offset(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q->limits.misaligned) return -1; if (bdev_is_partition(bdev)) return queue_limit_alignment_offset(&q->limits, bdev->bd_part->start_sect); return q->limits.alignment_offset; } static inline int queue_discard_alignment(const struct request_queue *q) { if (q->limits.discard_misaligned) return -1; return q->limits.discard_alignment; } static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) { unsigned int alignment, granularity, offset; if (!lim->max_discard_sectors) return 0; /* Why are these in bytes, not sectors? */ alignment = lim->discard_alignment >> SECTOR_SHIFT; granularity = lim->discard_granularity >> SECTOR_SHIFT; if (!granularity) return 0; /* Offset of the partition start in 'granularity' sectors */ offset = sector_div(sector, granularity); /* And why do we do this modulus *again* in blkdev_issue_discard()? */ offset = (granularity + alignment - offset) % granularity; /* Turn it back into bytes, gaah */ return offset << SECTOR_SHIFT; } /* * Two cases of handling DISCARD merge: * If max_discard_segments > 1, the driver takes every bio * as a range and send them to controller together. The ranges * needn't to be contiguous. * Otherwise, the bios/requests will be handled as same as * others which should be contiguous. */ static inline bool blk_discard_mergable(struct request *req) { if (req_op(req) == REQ_OP_DISCARD && queue_max_discard_segments(req->q) > 1) return true; return false; } static inline int bdev_discard_alignment(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (bdev_is_partition(bdev)) return queue_limit_discard_alignment(&q->limits, bdev->bd_part->start_sect); return q->limits.discard_alignment; } static inline unsigned int bdev_write_same(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) return q->limits.max_write_same_sectors; return 0; } static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) return q->limits.max_write_zeroes_sectors; return 0; } static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) return blk_queue_zoned_model(q); return BLK_ZONED_NONE; } static inline bool bdev_is_zoned(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) return blk_queue_is_zoned(q); return false; } static inline sector_t bdev_zone_sectors(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) return blk_queue_zone_sectors(q); return 0; } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) return queue_max_open_zones(q); return 0; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); if (q) return queue_max_active_zones(q); return 0; } static inline int queue_dma_alignment(const struct request_queue *q) { return q ? q->dma_alignment : 511; } static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr, unsigned int len) { unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; return !(addr & alignment) && !(len & alignment); } /* assumes size > 256 */ static inline unsigned int blksize_bits(unsigned int size) { unsigned int bits = 8; do { bits++; size >>= 1; } while (size > 256); return bits; } static inline unsigned int block_size(struct block_device *bdev) { return 1 << bdev->bd_inode->i_blkbits; } int kblockd_schedule_work(struct work_struct *work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ MODULE_ALIAS("block-major-" __stringify(major) "-*") #if defined(CONFIG_BLK_DEV_INTEGRITY) enum blk_integrity_flags { BLK_INTEGRITY_VERIFY = 1 << 0, BLK_INTEGRITY_GENERATE = 1 << 1, BLK_INTEGRITY_DEVICE_CAPABLE = 1 << 2, BLK_INTEGRITY_IP_CHECKSUM = 1 << 3, }; struct blk_integrity_iter { void *prot_buf; void *data_buf; sector_t seed; unsigned int data_size; unsigned short interval; const char *disk_name; }; typedef blk_status_t (integrity_processing_fn) (struct blk_integrity_iter *); typedef void (integrity_prepare_fn) (struct request *); typedef void (integrity_complete_fn) (struct request *, unsigned int); struct blk_integrity_profile { integrity_processing_fn *generate_fn; integrity_processing_fn *verify_fn; integrity_prepare_fn *prepare_fn; integrity_complete_fn *complete_fn; const char *name; }; extern void blk_integrity_register(struct gendisk *, struct blk_integrity *); extern void blk_integrity_unregister(struct gendisk *); extern int blk_integrity_compare(struct gendisk *, struct gendisk *); extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *, struct scatterlist *); extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *); static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { struct blk_integrity *bi = &disk->queue->integrity; if (!bi->profile) return NULL; return bi; } static inline struct blk_integrity *bdev_get_integrity(struct block_device *bdev) { return blk_get_integrity(bdev->bd_disk); } static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { return q->integrity.profile; } static inline bool blk_integrity_rq(struct request *rq) { return rq->cmd_flags & REQ_INTEGRITY; } static inline void blk_queue_max_integrity_segments(struct request_queue *q, unsigned int segs) { q->limits.max_integrity_segments = segs; } static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return q->limits.max_integrity_segments; } /** * bio_integrity_intervals - Return number of integrity intervals for a bio * @bi: blk_integrity profile for device * @sectors: Size of the bio in 512-byte sectors * * Description: The block layer calculates everything in 512 byte * sectors but integrity metadata is done in terms of the data integrity * interval size of the storage device. Convert the block layer sectors * to the appropriate number of integrity intervals. */ static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { return sectors >> (bi->interval_exp - 9); } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { return bio_integrity_intervals(bi, sectors) * bi->tuple_size; } /* * Return the first bvec that contains integrity data. Only drivers that are * limited to a single integrity segment should use this helper. */ static inline struct bio_vec *rq_integrity_vec(struct request *rq) { if (WARN_ON_ONCE(queue_max_integrity_segments(rq->q) > 1)) return NULL; return rq->bio->bi_integrity->bip_vec; } #else /* CONFIG_BLK_DEV_INTEGRITY */ struct bio; struct block_device; struct gendisk; struct blk_integrity; static inline int blk_integrity_rq(struct request *rq) { return 0; } static inline int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *b) { return 0; } static inline int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *b, struct scatterlist *s) { return 0; } static inline struct blk_integrity *bdev_get_integrity(struct block_device *b) { return NULL; } static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) { return NULL; } static inline bool blk_integrity_queue_supports_integrity(struct request_queue *q) { return false; } static inline int blk_integrity_compare(struct gendisk *a, struct gendisk *b) { return 0; } static inline void blk_integrity_register(struct gendisk *d, struct blk_integrity *b) { } static inline void blk_integrity_unregister(struct gendisk *d) { } static inline void blk_queue_max_integrity_segments(struct request_queue *q, unsigned int segs) { } static inline unsigned short queue_max_integrity_segments(const struct request_queue *q) { return 0; } static inline unsigned int bio_integrity_intervals(struct blk_integrity *bi, unsigned int sectors) { return 0; } static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi, unsigned int sectors) { return 0; } static inline struct bio_vec *rq_integrity_vec(struct request *rq) { return NULL; } #endif /* CONFIG_BLK_DEV_INTEGRITY */ #ifdef CONFIG_BLK_INLINE_ENCRYPTION bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q); void blk_ksm_unregister(struct request_queue *q); #else /* CONFIG_BLK_INLINE_ENCRYPTION */ static inline bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) { return true; } static inline void blk_ksm_unregister(struct request_queue *q) { } #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ struct block_device_operations { blk_qc_t (*submit_bio) (struct bio *bio); int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); unsigned int (*check_events) (struct gendisk *disk, unsigned int clearing); void (*unlock_native_capacity) (struct gendisk *); int (*revalidate_disk) (struct gendisk *); int (*getgeo)(struct block_device *, struct hd_geometry *); /* this callback is with swap_lock and sometimes page table lock held */ void (*swap_slot_free_notify) (struct block_device *, unsigned long); int (*report_zones)(struct gendisk *, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); char *(*devnode)(struct gendisk *disk, umode_t *mode); struct module *owner; const struct pr_ops *pr_ops; }; #ifdef CONFIG_COMPAT extern int blkdev_compat_ptr_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long); #else #define blkdev_compat_ptr_ioctl NULL #endif extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long); extern int bdev_read_page(struct block_device *, sector_t, struct page *); extern int bdev_write_page(struct block_device *, sector_t, struct page *, struct writeback_control *); #ifdef CONFIG_BLK_DEV_ZONED bool blk_req_needs_zone_write_lock(struct request *rq); bool blk_req_zone_write_trylock(struct request *rq); void __blk_req_zone_write_lock(struct request *rq); void __blk_req_zone_write_unlock(struct request *rq); static inline void blk_req_zone_write_lock(struct request *rq) { if (blk_req_needs_zone_write_lock(rq)) __blk_req_zone_write_lock(rq); } static inline void blk_req_zone_write_unlock(struct request *rq) { if (rq->rq_flags & RQF_ZONE_WRITE_LOCKED) __blk_req_zone_write_unlock(rq); } static inline bool blk_req_zone_is_write_locked(struct request *rq) { return rq->q->seq_zones_wlock && test_bit(blk_rq_zone_no(rq), rq->q->seq_zones_wlock); } static inline bool blk_req_can_dispatch_to_zone(struct request *rq) { if (!blk_req_needs_zone_write_lock(rq)) return true; return !blk_req_zone_is_write_locked(rq); } #else static inline bool blk_req_needs_zone_write_lock(struct request *rq) { return false; } static inline void blk_req_zone_write_lock(struct request *rq) { } static inline void blk_req_zone_write_unlock(struct request *rq) { } static inline bool blk_req_zone_is_write_locked(struct request *rq) { return false; } static inline bool blk_req_can_dispatch_to_zone(struct request *rq) { return true; } #endif /* CONFIG_BLK_DEV_ZONED */ static inline void blk_wake_io_task(struct task_struct *waiter) { /* * If we're polling, the task itself is doing the completions. For * that case, we don't need to signal a wakeup, it's enough to just * mark us as RUNNING. */ if (waiter == current) __set_current_state(TASK_RUNNING); else wake_up_process(waiter); } unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time); unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, struct bio *bio); void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time); /** * bio_start_io_acct - start I/O accounting for bio based drivers * @bio: bio to start account for * * Returns the start time that should be passed back to bio_end_io_acct(). */ static inline unsigned long bio_start_io_acct(struct bio *bio) { return disk_start_io_acct(bio->bi_disk, bio_sectors(bio), bio_op(bio)); } /** * bio_end_io_acct - end I/O accounting for bio based drivers * @bio: bio to end account for * @start: start time returned by bio_start_io_acct() */ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) { return disk_end_io_acct(bio->bi_disk, bio_op(bio), start_time); } int bdev_read_only(struct block_device *bdev); int set_blocksize(struct block_device *bdev, int size); const char *bdevname(struct block_device *bdev, char *buffer); struct block_device *lookup_bdev(const char *); void blkdev_show(struct seq_file *seqf, off_t offset); #define BDEVNAME_SIZE 32 /* Largest string for a blockdev identifier */ #define BDEVT_SIZE 10 /* Largest string for MAJ:MIN for blkdev */ #ifdef CONFIG_BLOCK #define BLKDEV_MAJOR_MAX 512 #else #define BLKDEV_MAJOR_MAX 0 #endif struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder); struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder); int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, void *holder); void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void *holder); void blkdev_put(struct block_device *bdev, fmode_t mode); struct block_device *I_BDEV(struct inode *inode); struct block_device *bdget_part(struct hd_struct *part); struct block_device *bdgrab(struct block_device *bdev); void bdput(struct block_device *); #ifdef CONFIG_BLOCK void invalidate_bdev(struct block_device *bdev); int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend); int sync_blockdev(struct block_device *bdev); #else static inline void invalidate_bdev(struct block_device *bdev) { } static inline int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend) { return 0; } static inline int sync_blockdev(struct block_device *bdev) { return 0; } #endif int fsync_bdev(struct block_device *bdev); struct super_block *freeze_bdev(struct block_device *bdev); int thaw_bdev(struct block_device *bdev, struct super_block *sb); #endif /* _LINUX_BLKDEV_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 /* BlueZ - Bluetooth protocol stack for Linux Copyright (C) 2000-2001 Qualcomm Incorporated Written 2000,2001 by Maxim Krasnyansky <maxk@qualcomm.com> This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License version 2 as published by the Free Software Foundation; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) AND AUTHOR(S) BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. ALL LIABILITY, INCLUDING LIABILITY FOR INFRINGEMENT OF ANY PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS, RELATING TO USE OF THIS SOFTWARE IS DISCLAIMED. */ #ifndef __BLUETOOTH_H #define __BLUETOOTH_H #include <linux/poll.h> #include <net/sock.h> #include <linux/seq_file.h> #define BT_SUBSYS_VERSION 2 #define BT_SUBSYS_REVISION 22 #ifndef AF_BLUETOOTH #define AF_BLUETOOTH 31 #define PF_BLUETOOTH AF_BLUETOOTH #endif /* Bluetooth versions */ #define BLUETOOTH_VER_1_1 1 #define BLUETOOTH_VER_1_2 2 #define BLUETOOTH_VER_2_0 3 #define BLUETOOTH_VER_2_1 4 #define BLUETOOTH_VER_4_0 6 /* Reserv for core and drivers use */ #define BT_SKB_RESERVE 8 #define BTPROTO_L2CAP 0 #define BTPROTO_HCI 1 #define BTPROTO_SCO 2 #define BTPROTO_RFCOMM 3 #define BTPROTO_BNEP 4 #define BTPROTO_CMTP 5 #define BTPROTO_HIDP 6 #define BTPROTO_AVDTP 7 #define SOL_HCI 0 #define SOL_L2CAP 6 #define SOL_SCO 17 #define SOL_RFCOMM 18 #define BT_SECURITY 4 struct bt_security { __u8 level; __u8 key_size; }; #define BT_SECURITY_SDP 0 #define BT_SECURITY_LOW 1 #define BT_SECURITY_MEDIUM 2 #define BT_SECURITY_HIGH 3 #define BT_SECURITY_FIPS 4 #define BT_DEFER_SETUP 7 #define BT_FLUSHABLE 8 #define BT_FLUSHABLE_OFF 0 #define BT_FLUSHABLE_ON 1 #define BT_POWER 9 struct bt_power { __u8 force_active; }; #define BT_POWER_FORCE_ACTIVE_OFF 0 #define BT_POWER_FORCE_ACTIVE_ON 1 #define BT_CHANNEL_POLICY 10 /* BR/EDR only (default policy) * AMP controllers cannot be used. * Channel move requests from the remote device are denied. * If the L2CAP channel is currently using AMP, move the channel to BR/EDR. */ #define BT_CHANNEL_POLICY_BREDR_ONLY 0 /* BR/EDR Preferred * Allow use of AMP controllers. * If the L2CAP channel is currently on AMP, move it to BR/EDR. * Channel move requests from the remote device are allowed. */ #define BT_CHANNEL_POLICY_BREDR_PREFERRED 1 /* AMP Preferred * Allow use of AMP controllers * If the L2CAP channel is currently on BR/EDR and AMP controller * resources are available, initiate a channel move to AMP. * Channel move requests from the remote device are allowed. * If the L2CAP socket has not been connected yet, try to create * and configure the channel directly on an AMP controller rather * than BR/EDR. */ #define BT_CHANNEL_POLICY_AMP_PREFERRED 2 #define BT_VOICE 11 struct bt_voice { __u16 setting; }; #define BT_VOICE_TRANSPARENT 0x0003 #define BT_VOICE_CVSD_16BIT 0x0060 #define BT_SNDMTU 12 #define BT_RCVMTU 13 #define BT_PHY 14 #define BT_PHY_BR_1M_1SLOT 0x00000001 #define BT_PHY_BR_1M_3SLOT 0x00000002 #define BT_PHY_BR_1M_5SLOT 0x00000004 #define BT_PHY_EDR_2M_1SLOT 0x00000008 #define BT_PHY_EDR_2M_3SLOT 0x00000010 #define BT_PHY_EDR_2M_5SLOT 0x00000020 #define BT_PHY_EDR_3M_1SLOT 0x00000040 #define BT_PHY_EDR_3M_3SLOT 0x00000080 #define BT_PHY_EDR_3M_5SLOT 0x00000100 #define BT_PHY_LE_1M_TX 0x00000200 #define BT_PHY_LE_1M_RX 0x00000400 #define BT_PHY_LE_2M_TX 0x00000800 #define BT_PHY_LE_2M_RX 0x00001000 #define BT_PHY_LE_CODED_TX 0x00002000 #define BT_PHY_LE_CODED_RX 0x00004000 #define BT_MODE 15 #define BT_MODE_BASIC 0x00 #define BT_MODE_ERTM 0x01 #define BT_MODE_STREAMING 0x02 #define BT_MODE_LE_FLOWCTL 0x03 #define BT_MODE_EXT_FLOWCTL 0x04 #define BT_PKT_STATUS 16 #define BT_SCM_PKT_STATUS 0x03 __printf(1, 2) void bt_info(const char *fmt, ...); __printf(1, 2) void bt_warn(const char *fmt, ...); __printf(1, 2) void bt_err(const char *fmt, ...); #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) void bt_dbg_set(bool enable); bool bt_dbg_get(void); __printf(1, 2) void bt_dbg(const char *fmt, ...); #endif __printf(1, 2) void bt_warn_ratelimited(const char *fmt, ...); __printf(1, 2) void bt_err_ratelimited(const char *fmt, ...); #define BT_INFO(fmt, ...) bt_info(fmt "\n", ##__VA_ARGS__) #define BT_WARN(fmt, ...) bt_warn(fmt "\n", ##__VA_ARGS__) #define BT_ERR(fmt, ...) bt_err(fmt "\n", ##__VA_ARGS__) #if IS_ENABLED(CONFIG_BT_FEATURE_DEBUG) #define BT_DBG(fmt, ...) bt_dbg(fmt "\n", ##__VA_ARGS__) #else #define BT_DBG(fmt, ...) pr_debug(fmt "\n", ##__VA_ARGS__) #endif #define bt_dev_info(hdev, fmt, ...) \ BT_INFO("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_warn(hdev, fmt, ...) \ BT_WARN("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err(hdev, fmt, ...) \ BT_ERR("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_dbg(hdev, fmt, ...) \ BT_DBG("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_warn_ratelimited(hdev, fmt, ...) \ bt_warn_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) #define bt_dev_err_ratelimited(hdev, fmt, ...) \ bt_err_ratelimited("%s: " fmt, (hdev)->name, ##__VA_ARGS__) /* Connection and socket states */ enum { BT_CONNECTED = 1, /* Equal to TCP_ESTABLISHED to make net code happy */ BT_OPEN, BT_BOUND, BT_LISTEN, BT_CONNECT, BT_CONNECT2, BT_CONFIG, BT_DISCONN, BT_CLOSED }; /* If unused will be removed by compiler */ static inline const char *state_to_string(int state) { switch (state) { case BT_CONNECTED: return "BT_CONNECTED"; case BT_OPEN: return "BT_OPEN"; case BT_BOUND: return "BT_BOUND"; case BT_LISTEN: return "BT_LISTEN"; case BT_CONNECT: return "BT_CONNECT"; case BT_CONNECT2: return "BT_CONNECT2"; case BT_CONFIG: return "BT_CONFIG"; case BT_DISCONN: return "BT_DISCONN"; case BT_CLOSED: return "BT_CLOSED"; } return "invalid state"; } /* BD Address */ typedef struct { __u8 b[6]; } __packed bdaddr_t; /* BD Address type */ #define BDADDR_BREDR 0x00 #define BDADDR_LE_PUBLIC 0x01 #define BDADDR_LE_RANDOM 0x02 static inline bool bdaddr_type_is_valid(u8 type) { switch (type) { case BDADDR_BREDR: case BDADDR_LE_PUBLIC: case BDADDR_LE_RANDOM: return true; } return false; } static inline bool bdaddr_type_is_le(u8 type) { switch (type) { case BDADDR_LE_PUBLIC: case BDADDR_LE_RANDOM: return true; } return false; } #define BDADDR_ANY (&(bdaddr_t) {{0, 0, 0, 0, 0, 0}}) #define BDADDR_NONE (&(bdaddr_t) {{0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}) /* Copy, swap, convert BD Address */ static inline int bacmp(const bdaddr_t *ba1, const bdaddr_t *ba2) { return memcmp(ba1, ba2, sizeof(bdaddr_t)); } static inline void bacpy(bdaddr_t *dst, const bdaddr_t *src) { memcpy(dst, src, sizeof(bdaddr_t)); } void baswap(bdaddr_t *dst, const bdaddr_t *src); /* Common socket structures and functions */ #define bt_sk(__sk) ((struct bt_sock *) __sk) struct bt_sock { struct sock sk; struct list_head accept_q; struct sock *parent; unsigned long flags; void (*skb_msg_name)(struct sk_buff *, void *, int *); void (*skb_put_cmsg)(struct sk_buff *, struct msghdr *, struct sock *); }; enum { BT_SK_DEFER_SETUP, BT_SK_SUSPEND, }; struct bt_sock_list { struct hlist_head head; rwlock_t lock; #ifdef CONFIG_PROC_FS int (* custom_seq_show)(struct seq_file *, void *); #endif }; int bt_sock_register(int proto, const struct net_proto_family *ops); void bt_sock_unregister(int proto); void bt_sock_link(struct bt_sock_list *l, struct sock *s); void bt_sock_unlink(struct bt_sock_list *l, struct sock *s); int bt_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); int bt_sock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, int flags); __poll_t bt_sock_poll(struct file *file, struct socket *sock, poll_table *wait); int bt_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo); int bt_sock_wait_ready(struct sock *sk, unsigned long flags); void bt_accept_enqueue(struct sock *parent, struct sock *sk, bool bh); void bt_accept_unlink(struct sock *sk); struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock); /* Skb helpers */ struct l2cap_ctrl { u8 sframe:1, poll:1, final:1, fcs:1, sar:2, super:2; u16 reqseq; u16 txseq; u8 retries; __le16 psm; bdaddr_t bdaddr; struct l2cap_chan *chan; }; struct sco_ctrl { u8 pkt_status; }; struct hci_dev; typedef void (*hci_req_complete_t)(struct hci_dev *hdev, u8 status, u16 opcode); typedef void (*hci_req_complete_skb_t)(struct hci_dev *hdev, u8 status, u16 opcode, struct sk_buff *skb); #define HCI_REQ_START BIT(0) #define HCI_REQ_SKB BIT(1) struct hci_ctrl { u16 opcode; u8 req_flags; u8 req_event; union { hci_req_complete_t req_complete; hci_req_complete_skb_t req_complete_skb; }; }; struct bt_skb_cb { u8 pkt_type; u8 force_active; u16 expect; u8 incoming:1; union { struct l2cap_ctrl l2cap; struct sco_ctrl sco; struct hci_ctrl hci; }; }; #define bt_cb(skb) ((struct bt_skb_cb *)((skb)->cb)) #define hci_skb_pkt_type(skb) bt_cb((skb))->pkt_type #define hci_skb_expect(skb) bt_cb((skb))->expect #define hci_skb_opcode(skb) bt_cb((skb))->hci.opcode static inline struct sk_buff *bt_skb_alloc(unsigned int len, gfp_t how) { struct sk_buff *skb; skb = alloc_skb(len + BT_SKB_RESERVE, how); if (skb) skb_reserve(skb, BT_SKB_RESERVE); return skb; } static inline struct sk_buff *bt_skb_send_alloc(struct sock *sk, unsigned long len, int nb, int *err) { struct sk_buff *skb; skb = sock_alloc_send_skb(sk, len + BT_SKB_RESERVE, nb, err); if (skb) skb_reserve(skb, BT_SKB_RESERVE); if (!skb && *err) return NULL; *err = sock_error(sk); if (*err) goto out; if (sk->sk_shutdown) { *err = -ECONNRESET; goto out; } return skb; out: kfree_skb(skb); return NULL; } int bt_to_errno(u16 code); void hci_sock_set_flag(struct sock *sk, int nr); void hci_sock_clear_flag(struct sock *sk, int nr); int hci_sock_test_flag(struct sock *sk, int nr); unsigned short hci_sock_get_channel(struct sock *sk); u32 hci_sock_get_cookie(struct sock *sk); int hci_sock_init(void); void hci_sock_cleanup(void); int bt_sysfs_init(void); void bt_sysfs_cleanup(void); int bt_procfs_init(struct net *net, const char *name, struct bt_sock_list *sk_list, int (*seq_show)(struct seq_file *, void *)); void bt_procfs_cleanup(struct net *net, const char *name); extern struct dentry *bt_debugfs; int l2cap_init(void); void l2cap_exit(void); #if IS_ENABLED(CONFIG_BT_BREDR) int sco_init(void); void sco_exit(void); #else static inline int sco_init(void) { return 0; } static inline void sco_exit(void) { } #endif int mgmt_init(void); void mgmt_exit(void); void bt_sock_reclassify_lock(struct sock *sk, int proto); #endif /* __BLUETOOTH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM alarmtimer #if !defined(_TRACE_ALARMTIMER_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_ALARMTIMER_H #include <linux/alarmtimer.h> #include <linux/rtc.h> #include <linux/tracepoint.h> TRACE_DEFINE_ENUM(ALARM_REALTIME); TRACE_DEFINE_ENUM(ALARM_BOOTTIME); TRACE_DEFINE_ENUM(ALARM_REALTIME_FREEZER); TRACE_DEFINE_ENUM(ALARM_BOOTTIME_FREEZER); #define show_alarm_type(type) __print_flags(type, " | ", \ { 1 << ALARM_REALTIME, "REALTIME" }, \ { 1 << ALARM_BOOTTIME, "BOOTTIME" }, \ { 1 << ALARM_REALTIME_FREEZER, "REALTIME Freezer" }, \ { 1 << ALARM_BOOTTIME_FREEZER, "BOOTTIME Freezer" }) TRACE_EVENT(alarmtimer_suspend, TP_PROTO(ktime_t expires, int flag), TP_ARGS(expires, flag), TP_STRUCT__entry( __field(s64, expires) __field(unsigned char, alarm_type) ), TP_fast_assign( __entry->expires = expires; __entry->alarm_type = flag; ), TP_printk("alarmtimer type:%s expires:%llu", show_alarm_type((1 << __entry->alarm_type)), __entry->expires ) ); DECLARE_EVENT_CLASS(alarm_class, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now), TP_STRUCT__entry( __field(void *, alarm) __field(unsigned char, alarm_type) __field(s64, expires) __field(s64, now) ), TP_fast_assign( __entry->alarm = alarm; __entry->alarm_type = alarm->type; __entry->expires = alarm->node.expires; __entry->now = now; ), TP_printk("alarmtimer:%p type:%s expires:%llu now:%llu", __entry->alarm, show_alarm_type((1 << __entry->alarm_type)), __entry->expires, __entry->now ) ); DEFINE_EVENT(alarm_class, alarmtimer_fired, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now) ); DEFINE_EVENT(alarm_class, alarmtimer_start, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now) ); DEFINE_EVENT(alarm_class, alarmtimer_cancel, TP_PROTO(struct alarm *alarm, ktime_t now), TP_ARGS(alarm, now) ); #endif /* _TRACE_ALARMTIMER_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 // SPDX-License-Identifier: GPL-2.0-only #include <linux/uaccess.h> #include <linux/kernel.h> #ifdef CONFIG_X86_64 static __always_inline u64 canonical_address(u64 vaddr, u8 vaddr_bits) { return ((s64)vaddr << (64 - vaddr_bits)) >> (64 - vaddr_bits); } bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { unsigned long vaddr = (unsigned long)unsafe_src; /* * Range covering the highest possible canonical userspace address * as well as non-canonical address range. For the canonical range * we also need to include the userspace guard page. */ return vaddr >= TASK_SIZE_MAX + PAGE_SIZE && canonical_address(vaddr, boot_cpu_data.x86_virt_bits) == vaddr; } #else bool copy_from_kernel_nofault_allowed(const void *unsafe_src, size_t size) { return (unsigned long)unsafe_src >= TASK_SIZE_MAX; } #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/mount.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/ns_common.h> #include <linux/fs_pin.h> struct mnt_namespace { atomic_t count; struct ns_common ns; struct mount * root; /* * Traversal and modification of .list is protected by either * - taking namespace_sem for write, OR * - taking namespace_sem for read AND taking .ns_lock. */ struct list_head list; spinlock_t ns_lock; struct user_namespace *user_ns; struct ucounts *ucounts; u64 seq; /* Sequence number to prevent loops */ wait_queue_head_t poll; u64 event; unsigned int mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; } __randomize_layout; struct mnt_pcp { int mnt_count; int mnt_writers; }; struct mountpoint { struct hlist_node m_hash; struct dentry *m_dentry; struct hlist_head m_list; int m_count; }; struct mount { struct hlist_node mnt_hash; struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; union { struct rcu_head mnt_rcu; struct llist_node mnt_llist; }; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; #else int mnt_count; int mnt_writers; #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ struct list_head mnt_instance; /* mount instance on sb->s_mounts */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ struct list_head mnt_share; /* circular list of shared mounts */ struct list_head mnt_slave_list;/* list of slave mounts */ struct list_head mnt_slave; /* slave list entry */ struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ union { struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ struct hlist_node mnt_umount; }; struct list_head mnt_umounting; /* list entry for umount propagation */ #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; #endif int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ int mnt_expiry_mark; /* true if marked for expiry */ struct hlist_head mnt_pins; struct hlist_head mnt_stuck_children; } __randomize_layout; #define MNT_NS_INTERNAL ERR_PTR(-EINVAL) /* distinct from any mnt_namespace */ static inline struct mount *real_mount(struct vfsmount *mnt) { return container_of(mnt, struct mount, mnt); } static inline int mnt_has_parent(struct mount *mnt) { return mnt != mnt->mnt_parent; } static inline int is_mounted(struct vfsmount *mnt) { /* neither detached nor internal? */ return !IS_ERR_OR_NULL(real_mount(mnt)->mnt_ns); } extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); static inline bool __path_is_mountpoint(const struct path *path) { struct mount *m = __lookup_mnt(path->mnt, path->dentry); return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); } extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) { if (!d_mountpoint(dentry)) return; __detach_mounts(dentry); } static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); } extern seqlock_t mount_lock; static inline void lock_mount_hash(void) { write_seqlock(&mount_lock); } static inline void unlock_mount_hash(void) { write_sequnlock(&mount_lock); } struct proc_mounts { struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); struct mount cursor; }; extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(struct dentry *dentry); static inline bool is_local_mountpoint(struct dentry *dentry) { if (!d_mountpoint(dentry)) return false; return __is_local_mountpoint(dentry); } static inline bool is_anon_ns(struct mnt_namespace *ns) { return ns->seq == 0; } extern void mnt_cursor_del(struct mnt_namespace *ns, struct mount *cursor);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOPRIO_H #define IOPRIO_H #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/iocontext.h> /* * Gives us 8 prio classes with 13-bits of data for each class */ #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) #define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) /* * These are the io priority groups as implemented by CFQ. RT is the realtime * class, it always gets premium service. BE is the best-effort scheduling * class, the default for any process. IDLE is the idle scheduling class, it * is only served when no one else is using the disk. */ enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; /* * 8 best effort priority levels are supported */ #define IOPRIO_BE_NR (8) enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; /* * Fallback BE priority */ #define IOPRIO_NORM (4) /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority */ static inline int task_nice_ioprio(struct task_struct *task) { return (task_nice(task) + 20) / 5; } /* * This is for the case where the task hasn't asked for a specific IO class. * Check for idle and rt task process, and return appropriate IO class. */ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; else if (task_is_realtime(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; } /* * If the calling process has set an I/O priority, use that. Otherwise, return * the default I/O priority. */ static inline int get_current_ioprio(void) { struct io_context *ioc = current->io_context; if (ioc) return ioc->ioprio; return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); } /* * For inheritance, return the highest of the two given priorities */ extern int ioprio_best(unsigned short aprio, unsigned short bprio); extern int set_task_ioprio(struct task_struct *task, int ioprio); #ifdef CONFIG_BLOCK extern int ioprio_check_cap(int ioprio); #else static inline int ioprio_check_cap(int ioprio) { return -ENOTBLK; } #endif /* CONFIG_BLOCK */ #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the UDP module. * * Version: @(#)udp.h 1.0.2 05/07/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * * Fixes: * Alan Cox : Turned on udp checksums. I don't want to * chase 'memory corruption' bugs that aren't! */ #ifndef _UDP_H #define _UDP_H #include <linux/list.h> #include <linux/bug.h> #include <net/inet_sock.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <linux/ipv6.h> #include <linux/seq_file.h> #include <linux/poll.h> #include <linux/indirect_call_wrapper.h> /** * struct udp_skb_cb - UDP(-Lite) private variables * * @header: private variables used by IPv4/IPv6 * @cscov: checksum coverage length (UDP-Lite only) * @partial_cov: if set indicates partial csum coverage */ struct udp_skb_cb { union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; __u16 cscov; __u8 partial_cov; }; #define UDP_SKB_CB(__skb) ((struct udp_skb_cb *)((__skb)->cb)) /** * struct udp_hslot - UDP hash slot * * @head: head of list of sockets * @count: number of sockets in 'head' list * @lock: spinlock protecting changes to head/count */ struct udp_hslot { struct hlist_head head; int count; spinlock_t lock; } __attribute__((aligned(2 * sizeof(long)))); /** * struct udp_table - UDP table * * @hash: hash table, sockets are hashed on (local port) * @hash2: hash table, sockets are hashed on (local port, local address) * @mask: number of slots in hash tables, minus 1 * @log: log2(number of slots in hash table) */ struct udp_table { struct udp_hslot *hash; struct udp_hslot *hash2; unsigned int mask; unsigned int log; }; extern struct udp_table udp_table; void udp_table_init(struct udp_table *, const char *); static inline struct udp_hslot *udp_hashslot(struct udp_table *table, struct net *net, unsigned int num) { return &table->hash[udp_hashfn(net, num, table->mask)]; } /* * For secondary hash, net_hash_mix() is performed before calling * udp_hashslot2(), this explains difference with udp_hashslot() */ static inline struct udp_hslot *udp_hashslot2(struct udp_table *table, unsigned int hash) { return &table->hash2[hash & table->mask]; } extern struct proto udp_prot; extern atomic_long_t udp_memory_allocated; /* sysctl variables for udp */ extern long sysctl_udp_mem[3]; extern int sysctl_udp_rmem_min; extern int sysctl_udp_wmem_min; struct sk_buff; /* * Generic checksumming routines for UDP(-Lite) v4 and v6 */ static inline __sum16 __udp_lib_checksum_complete(struct sk_buff *skb) { return (UDP_SKB_CB(skb)->cscov == skb->len ? __skb_checksum_complete(skb) : __skb_checksum_complete_head(skb, UDP_SKB_CB(skb)->cscov)); } static inline int udp_lib_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __udp_lib_checksum_complete(skb); } /** * udp_csum_outgoing - compute UDPv4/v6 checksum over fragments * @sk: socket we are writing to * @skb: sk_buff containing the filled-in UDP header * (checksum field must be zeroed out) */ static inline __wsum udp_csum_outgoing(struct sock *sk, struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), 0); skb_queue_walk(&sk->sk_write_queue, skb) { csum = csum_add(csum, skb->csum); } return csum; } static inline __wsum udp_csum(struct sk_buff *skb) { __wsum csum = csum_partial(skb_transport_header(skb), sizeof(struct udphdr), skb->csum); for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) { csum = csum_add(csum, skb->csum); } return csum; } static inline __sum16 udp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_UDP, base); } void udp_set_csum(bool nocheck, struct sk_buff *skb, __be32 saddr, __be32 daddr, int len); static inline void udp_csum_pull_header(struct sk_buff *skb) { if (!skb->csum_valid && skb->ip_summed == CHECKSUM_NONE) skb->csum = csum_partial(skb->data, sizeof(struct udphdr), skb->csum); skb_pull_rcsum(skb, sizeof(struct udphdr)); UDP_SKB_CB(skb)->cscov -= sizeof(struct udphdr); } typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport, __be16 dport); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp4_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp4_gro_complete(struct sk_buff *, int)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *udp6_gro_receive(struct list_head *, struct sk_buff *)); INDIRECT_CALLABLE_DECLARE(int udp6_gro_complete(struct sk_buff *, int)); struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, struct udphdr *uh, struct sock *sk); int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup); struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, netdev_features_t features, bool is_ipv6); static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb) { struct udphdr *uh; unsigned int hlen, off; off = skb_gro_offset(skb); hlen = off + sizeof(*uh); uh = skb_gro_header_fast(skb, off); if (skb_gro_header_hard(skb, hlen)) uh = skb_gro_header_slow(skb, hlen, off); return uh; } /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */ static inline int udp_lib_hash(struct sock *sk) { BUG(); return 0; } void udp_lib_unhash(struct sock *sk); void udp_lib_rehash(struct sock *sk, u16 new_hash); static inline void udp_lib_close(struct sock *sk, long timeout) { sk_common_release(sk); } int udp_lib_get_port(struct sock *sk, unsigned short snum, unsigned int hash2_nulladdr); u32 udp_flow_hashrnd(void); static inline __be16 udp_flow_src_port(struct net *net, struct sk_buff *skb, int min, int max, bool use_eth) { u32 hash; if (min >= max) { /* Use default range */ inet_get_local_port_range(net, &min, &max); } hash = skb_get_hash(skb); if (unlikely(!hash)) { if (use_eth) { /* Can't find a normal hash, caller has indicated an * Ethernet packet so use that to compute a hash. */ hash = jhash(skb->data, 2 * ETH_ALEN, (__force u32) skb->protocol); } else { /* Can't derive any sort of hash for the packet, set * to some consistent random value. */ hash = udp_flow_hashrnd(); } } /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only upper 16 bits are relevant in the * computation for 16 bit port value. */ hash ^= hash << 16; return htons((((u64) hash * (max - min)) >> 32) + min); } static inline int udp_rqueue_get(struct sock *sk) { return sk_rmem_alloc_get(sk) - READ_ONCE(udp_sk(sk)->forward_deficit); } static inline bool udp_sk_bound_dev_eq(struct net *net, int bound_dev_if, int dif, int sdif) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) return inet_bound_dev_eq(!!net->ipv4.sysctl_udp_l3mdev_accept, bound_dev_if, dif, sdif); #else return inet_bound_dev_eq(true, bound_dev_if, dif, sdif); #endif } /* net/ipv4/udp.c */ void udp_destruct_sock(struct sock *sk); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len); int __udp_enqueue_schedule_skb(struct sock *sk, struct sk_buff *skb); void udp_skb_destructor(struct sock *sk, struct sk_buff *skb); struct sk_buff *__skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *off, int *err); static inline struct sk_buff *skb_recv_udp(struct sock *sk, unsigned int flags, int noblock, int *err) { int off = 0; return __skb_recv_udp(sk, flags, noblock, &off, err); } int udp_v4_early_demux(struct sk_buff *skb); bool udp_sk_rx_dst_set(struct sock *sk, struct dst_entry *dst); int udp_get_port(struct sock *sk, unsigned short snum, int (*saddr_cmp)(const struct sock *, const struct sock *)); int udp_err(struct sk_buff *, u32); int udp_abort(struct sock *sk, int err); int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len); int udp_push_pending_frames(struct sock *sk); void udp_flush_pending_frames(struct sock *sk); int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size); void udp4_hwcsum(struct sk_buff *skb, __be32 src, __be32 dst); int udp_rcv(struct sk_buff *skb); int udp_ioctl(struct sock *sk, int cmd, unsigned long arg); int udp_init_sock(struct sock *sk); int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int __udp_disconnect(struct sock *sk, int flags); int udp_disconnect(struct sock *sk, int flags); __poll_t udp_poll(struct file *file, struct socket *sock, poll_table *wait); struct sk_buff *skb_udp_tunnel_segment(struct sk_buff *skb, netdev_features_t features, bool is_ipv6); int udp_lib_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int udp_lib_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen, int (*push_pending_frames)(struct sock *)); struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif); struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp4_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport); struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif); struct sock *__udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, const struct in6_addr *daddr, __be16 dport, int dif, int sdif, struct udp_table *tbl, struct sk_buff *skb); struct sock *udp6_lib_lookup_skb(struct sk_buff *skb, __be16 sport, __be16 dport); /* UDP uses skb->dev_scratch to cache as much information as possible and avoid * possibly multiple cache miss on dequeue() */ struct udp_dev_scratch { /* skb->truesize and the stateless bit are embedded in a single field; * do not use a bitfield since the compiler emits better/smaller code * this way */ u32 _tsize_state; #if BITS_PER_LONG == 64 /* len and the bit needed to compute skb_csum_unnecessary * will be on cold cache lines at recvmsg time. * skb->len can be stored on 16 bits since the udp header has been * already validated and pulled. */ u16 len; bool is_linear; bool csum_unnecessary; #endif }; static inline struct udp_dev_scratch *udp_skb_scratch(struct sk_buff *skb) { return (struct udp_dev_scratch *)&skb->dev_scratch; } #if BITS_PER_LONG == 64 static inline unsigned int udp_skb_len(struct sk_buff *skb) { return udp_skb_scratch(skb)->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return udp_skb_scratch(skb)->csum_unnecessary; } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return udp_skb_scratch(skb)->is_linear; } #else static inline unsigned int udp_skb_len(struct sk_buff *skb) { return skb->len; } static inline bool udp_skb_csum_unnecessary(struct sk_buff *skb) { return skb_csum_unnecessary(skb); } static inline bool udp_skb_is_linear(struct sk_buff *skb) { return !skb_is_nonlinear(skb); } #endif static inline int copy_linear_skb(struct sk_buff *skb, int len, int off, struct iov_iter *to) { int n; n = copy_to_iter(skb->data + off, len, to); if (n == len) return 0; iov_iter_revert(to, n); return -EFAULT; } /* * SNMP statistics for UDP and UDP-Lite */ #define UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_statistics, field); \ else __SNMP_INC_STATS((net)->mib.udp_statistics, field); } while(0) #define __UDP6_INC_STATS(net, field, is_udplite) do { \ if (is_udplite) __SNMP_INC_STATS((net)->mib.udplite_stats_in6, field);\ else __SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #define UDP6_INC_STATS(net, field, __lite) do { \ if (__lite) SNMP_INC_STATS((net)->mib.udplite_stats_in6, field); \ else SNMP_INC_STATS((net)->mib.udp_stats_in6, field); \ } while(0) #if IS_ENABLED(CONFIG_IPV6) #define __UDPX_MIB(sk, ipv4) \ ({ \ ipv4 ? (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics) : \ (IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_stats_in6 : \ sock_net(sk)->mib.udp_stats_in6); \ }) #else #define __UDPX_MIB(sk, ipv4) \ ({ \ IS_UDPLITE(sk) ? sock_net(sk)->mib.udplite_statistics : \ sock_net(sk)->mib.udp_statistics; \ }) #endif #define __UDPX_INC_STATS(sk, field) \ __SNMP_INC_STATS(__UDPX_MIB(sk, (sk)->sk_family == AF_INET), field) #ifdef CONFIG_PROC_FS struct udp_seq_afinfo { sa_family_t family; struct udp_table *udp_table; }; struct udp_iter_state { struct seq_net_private p; int bucket; struct udp_seq_afinfo *bpf_seq_afinfo; }; void *udp_seq_start(struct seq_file *seq, loff_t *pos); void *udp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void udp_seq_stop(struct seq_file *seq, void *v); extern const struct seq_operations udp_seq_ops; extern const struct seq_operations udp6_seq_ops; int udp4_proc_init(void); void udp4_proc_exit(void); #endif /* CONFIG_PROC_FS */ int udpv4_offload_init(void); void udp_init(void); DECLARE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void); #if IS_ENABLED(CONFIG_IPV6) DECLARE_STATIC_KEY_FALSE(udpv6_encap_needed_key); void udpv6_encap_enable(void); #endif static inline struct sk_buff *udp_rcv_segment(struct sock *sk, struct sk_buff *skb, bool ipv4) { netdev_features_t features = NETIF_F_SG; struct sk_buff *segs; /* Avoid csum recalculation by skb_segment unless userspace explicitly * asks for the final checksum values */ if (!inet_get_convert_csum(sk)) features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM; /* UDP segmentation expects packets of type CHECKSUM_PARTIAL or * CHECKSUM_NONE in __udp_gso_segment. UDP GRO indeed builds partial * packets in udp_gro_complete_segment. As does UDP GSO, verified by * udp_send_skb. But when those packets are looped in dev_loopback_xmit * their ip_summed CHECKSUM_NONE is changed to CHECKSUM_UNNECESSARY. * Reset in this specific case, where PARTIAL is both correct and * required. */ if (skb->pkt_type == PACKET_LOOPBACK) skb->ip_summed = CHECKSUM_PARTIAL; /* the GSO CB lays after the UDP one, no need to save and restore any * CB fragment */ segs = __skb_gso_segment(skb, features, false); if (IS_ERR_OR_NULL(segs)) { int segs_nr = skb_shinfo(skb)->gso_segs; atomic_add(segs_nr, &sk->sk_drops); SNMP_ADD_STATS(__UDPX_MIB(sk, ipv4), UDP_MIB_INERRORS, segs_nr); kfree_skb(skb); return NULL; } consume_skb(skb); return segs; } #ifdef CONFIG_BPF_STREAM_PARSER struct sk_psock; struct proto *udp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); #endif /* BPF_STREAM_PARSER */ #endif /* _UDP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PGALLOC_H #define _ASM_X86_PGALLOC_H #include <linux/threads.h> #include <linux/mm.h> /* for struct page */ #include <linux/pagemap.h> #define __HAVE_ARCH_PTE_ALLOC_ONE #define __HAVE_ARCH_PGD_FREE #include <asm-generic/pgalloc.h> static inline int __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define paravirt_pgd_alloc(mm) __paravirt_pgd_alloc(mm) static inline void paravirt_pgd_free(struct mm_struct *mm, pgd_t *pgd) {} static inline void paravirt_alloc_pte(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count) {} static inline void paravirt_alloc_pud(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_alloc_p4d(struct mm_struct *mm, unsigned long pfn) {} static inline void paravirt_release_pte(unsigned long pfn) {} static inline void paravirt_release_pmd(unsigned long pfn) {} static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif /* * Flags to use when allocating a user page table page. */ extern gfp_t __userpte_alloc_gfp; #ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Instead of one PGD, we acquire two PGDs. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 * in a pointer to swap between the two 4k halves. */ #define PGD_ALLOCATION_ORDER 1 #else #define PGD_ALLOCATION_ORDER 0 #endif /* * Allocate and free page tables. */ extern pgd_t *pgd_alloc(struct mm_struct *); extern void pgd_free(struct mm_struct *mm, pgd_t *pgd); extern pgtable_t pte_alloc_one(struct mm_struct *); extern void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte); static inline void __pte_free_tlb(struct mmu_gather *tlb, struct page *pte, unsigned long address) { ___pte_free_tlb(tlb, pte); } static inline void pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); set_pmd(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } static inline void pmd_populate_kernel_safe(struct mm_struct *mm, pmd_t *pmd, pte_t *pte) { paravirt_alloc_pte(mm, __pa(pte) >> PAGE_SHIFT); set_pmd_safe(pmd, __pmd(__pa(pte) | _PAGE_TABLE)); } static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd, struct page *pte) { unsigned long pfn = page_to_pfn(pte); paravirt_alloc_pte(mm, pfn); set_pmd(pmd, __pmd(((pteval_t)pfn << PAGE_SHIFT) | _PAGE_TABLE)); } #define pmd_pgtable(pmd) pmd_page(pmd) #if CONFIG_PGTABLE_LEVELS > 2 extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, unsigned long address) { ___pmd_free_tlb(tlb, pmd); } #ifdef CONFIG_X86_PAE extern void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd); #else /* !CONFIG_X86_PAE */ static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd))); } static inline void pud_populate_safe(struct mm_struct *mm, pud_t *pud, pmd_t *pmd) { paravirt_alloc_pmd(mm, __pa(pmd) >> PAGE_SHIFT); set_pud_safe(pud, __pud(_PAGE_TABLE | __pa(pmd))); } #endif /* CONFIG_X86_PAE */ #if CONFIG_PGTABLE_LEVELS > 3 static inline void p4d_populate(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); set_p4d(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } static inline void p4d_populate_safe(struct mm_struct *mm, p4d_t *p4d, pud_t *pud) { paravirt_alloc_pud(mm, __pa(pud) >> PAGE_SHIFT); set_p4d_safe(p4d, __p4d(_PAGE_TABLE | __pa(pud))); } extern void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud); static inline void __pud_free_tlb(struct mmu_gather *tlb, pud_t *pud, unsigned long address) { ___pud_free_tlb(tlb, pud); } #if CONFIG_PGTABLE_LEVELS > 4 static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } static inline void pgd_populate_safe(struct mm_struct *mm, pgd_t *pgd, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; paravirt_alloc_p4d(mm, __pa(p4d) >> PAGE_SHIFT); set_pgd_safe(pgd, __pgd(_PAGE_TABLE | __pa(p4d))); } static inline p4d_t *p4d_alloc_one(struct mm_struct *mm, unsigned long addr) { gfp_t gfp = GFP_KERNEL_ACCOUNT; if (mm == &init_mm) gfp &= ~__GFP_ACCOUNT; return (p4d_t *)get_zeroed_page(gfp); } static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d) { if (!pgtable_l5_enabled()) return; BUG_ON((unsigned long)p4d & (PAGE_SIZE-1)); free_page((unsigned long)p4d); } extern void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d); static inline void __p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d, unsigned long address) { if (pgtable_l5_enabled()) ___p4d_free_tlb(tlb, p4d); } #endif /* CONFIG_PGTABLE_LEVELS > 4 */ #endif /* CONFIG_PGTABLE_LEVELS > 3 */ #endif /* CONFIG_PGTABLE_LEVELS > 2 */ #endif /* _ASM_X86_PGALLOC_H */
1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/acl.c * * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> */ #include <linux/quotaops.h> #include "ext4_jbd2.h" #include "ext4.h" #include "xattr.h" #include "acl.h" /* * Convert from filesystem to in-memory representation. */ static struct posix_acl * ext4_acl_from_disk(const void *value, size_t size) { const char *end = (char *)value + size; int n, count; struct posix_acl *acl; if (!value) return NULL; if (size < sizeof(ext4_acl_header)) return ERR_PTR(-EINVAL); if (((ext4_acl_header *)value)->a_version != cpu_to_le32(EXT4_ACL_VERSION)) return ERR_PTR(-EINVAL); value = (char *)value + sizeof(ext4_acl_header); count = ext4_acl_count(size); if (count < 0) return ERR_PTR(-EINVAL); if (count == 0) return NULL; acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n = 0; n < count; n++) { ext4_acl_entry *entry = (ext4_acl_entry *)value; if ((char *)value + sizeof(ext4_acl_entry_short) > end) goto fail; acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); switch (acl->a_entries[n].e_tag) { case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: value = (char *)value + sizeof(ext4_acl_entry_short); break; case ACL_USER: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); break; case ACL_GROUP: value = (char *)value + sizeof(ext4_acl_entry); if ((char *)value > end) goto fail; acl->a_entries[n].e_gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); break; default: goto fail; } } if (value != end) goto fail; return acl; fail: posix_acl_release(acl); return ERR_PTR(-EINVAL); } /* * Convert from in-memory to filesystem representation. */ static void * ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) { ext4_acl_header *ext_acl; char *e; size_t n; *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); e = (char *)ext_acl + sizeof(ext4_acl_header); for (n = 0; n < acl->a_count; n++) { const struct posix_acl_entry *acl_e = &acl->a_entries[n]; ext4_acl_entry *entry = (ext4_acl_entry *)e; entry->e_tag = cpu_to_le16(acl_e->e_tag); entry->e_perm = cpu_to_le16(acl_e->e_perm); switch (acl_e->e_tag) { case ACL_USER: entry->e_id = cpu_to_le32( from_kuid(&init_user_ns, acl_e->e_uid)); e += sizeof(ext4_acl_entry); break; case ACL_GROUP: entry->e_id = cpu_to_le32( from_kgid(&init_user_ns, acl_e->e_gid)); e += sizeof(ext4_acl_entry); break; case ACL_USER_OBJ: case ACL_GROUP_OBJ: case ACL_MASK: case ACL_OTHER: e += sizeof(ext4_acl_entry_short); break; default: goto fail; } } return (char *)ext_acl; fail: kfree(ext_acl); return ERR_PTR(-EINVAL); } /* * Inode operation get_posix_acl(). * * inode->i_mutex: don't care */ struct posix_acl * ext4_get_acl(struct inode *inode, int type) { int name_index; char *value = NULL; struct posix_acl *acl; int retval; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; break; default: BUG(); } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); } if (retval > 0) acl = ext4_acl_from_disk(value, retval); else if (retval == -ENODATA || retval == -ENOSYS) acl = NULL; else acl = ERR_PTR(retval); kfree(value); return acl; } /* * Set the access or default ACL of an inode. * * inode->i_mutex: down unless called from ext4_new_inode */ static int __ext4_set_acl(handle_t *handle, struct inode *inode, int type, struct posix_acl *acl, int xattr_flags) { int name_index; void *value = NULL; size_t size = 0; int error; switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name_index = EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; default: return -EINVAL; } if (acl) { value = ext4_acl_to_disk(acl, &size); if (IS_ERR(value)) return (int)PTR_ERR(value); } error = ext4_xattr_set_handle(handle, inode, name_index, "", value, size, xattr_flags); kfree(value); if (!error) set_cached_acl(inode, type, acl); return error; } int ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; int error, credits, retries = 0; size_t acl_size = acl ? ext4_acl_size(acl->a_count) : 0; umode_t mode = inode->i_mode; int update_mode = 0; error = dquot_initialize(inode); if (error) return error; retry: error = ext4_xattr_set_credits(inode, acl_size, false /* is_create */, &credits); if (error) return error; handle = ext4_journal_start(inode, EXT4_HT_XATTR, credits); if (IS_ERR(handle)) return PTR_ERR(handle); ext4_fc_start_update(inode); if ((type == ACL_TYPE_ACCESS) && acl) { error = posix_acl_update_mode(inode, &mode, &acl); if (error) goto out_stop; if (mode != inode->i_mode) update_mode = 1; } error = __ext4_set_acl(handle, inode, type, acl, 0 /* xattr_flags */); if (!error && update_mode) { inode->i_mode = mode; inode->i_ctime = current_time(inode); error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); ext4_fc_stop_update(inode); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; return error; } /* * Initialize the ACLs of a new inode. Called from ext4_new_inode. * * dir->i_mutex: down * inode->i_mutex: up (access to inode is still exclusive) */ int ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { struct posix_acl *default_acl, *acl; int error; error = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl); if (error) return error; if (default_acl) { error = __ext4_set_acl(handle, inode, ACL_TYPE_DEFAULT, default_acl, XATTR_CREATE); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!error) error = __ext4_set_acl(handle, inode, ACL_TYPE_ACCESS, acl, XATTR_CREATE); posix_acl_release(acl); } else { inode->i_acl = NULL; } return error; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Cryptographic API. * * Copyright (c) 2002 James Morris <jmorris@intercode.com.au> * Copyright (c) 2005 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_H #define _CRYPTO_INTERNAL_H #include <crypto/algapi.h> #include <linux/completion.h> #include <linux/list.h> #include <linux/module.h> #include <linux/notifier.h> #include <linux/numa.h> #include <linux/refcount.h> #include <linux/rwsem.h> #include <linux/sched.h> #include <linux/types.h> struct crypto_instance; struct crypto_template; struct crypto_larval { struct crypto_alg alg; struct crypto_alg *adult; struct completion completion; u32 mask; }; extern struct list_head crypto_alg_list; extern struct rw_semaphore crypto_alg_sem; extern struct blocking_notifier_head crypto_chain; #ifdef CONFIG_PROC_FS void __init crypto_init_proc(void); void __exit crypto_exit_proc(void); #else static inline void crypto_init_proc(void) { } static inline void crypto_exit_proc(void) { } #endif static inline unsigned int crypto_cipher_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } static inline unsigned int crypto_compress_ctxsize(struct crypto_alg *alg) { return alg->cra_ctxsize; } struct crypto_alg *crypto_mod_get(struct crypto_alg *alg); struct crypto_alg *crypto_alg_mod_lookup(const char *name, u32 type, u32 mask); struct crypto_larval *crypto_larval_alloc(const char *name, u32 type, u32 mask); void crypto_larval_kill(struct crypto_alg *alg); void crypto_alg_tested(const char *name, int err); void crypto_remove_spawns(struct crypto_alg *alg, struct list_head *list, struct crypto_alg *nalg); void crypto_remove_final(struct list_head *list); void crypto_shoot_alg(struct crypto_alg *alg); struct crypto_tfm *__crypto_alloc_tfm(struct crypto_alg *alg, u32 type, u32 mask); void *crypto_create_tfm_node(struct crypto_alg *alg, const struct crypto_type *frontend, int node); static inline void *crypto_create_tfm(struct crypto_alg *alg, const struct crypto_type *frontend) { return crypto_create_tfm_node(alg, frontend, NUMA_NO_NODE); } struct crypto_alg *crypto_find_alg(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask); void *crypto_alloc_tfm_node(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask, int node); static inline void *crypto_alloc_tfm(const char *alg_name, const struct crypto_type *frontend, u32 type, u32 mask) { return crypto_alloc_tfm_node(alg_name, frontend, type, mask, NUMA_NO_NODE); } int crypto_probing_notify(unsigned long val, void *v); unsigned int crypto_alg_extsize(struct crypto_alg *alg); int crypto_type_has_alg(const char *name, const struct crypto_type *frontend, u32 type, u32 mask); static inline struct crypto_alg *crypto_alg_get(struct crypto_alg *alg) { refcount_inc(&alg->cra_refcnt); return alg; } static inline void crypto_alg_put(struct crypto_alg *alg) { if (refcount_dec_and_test(&alg->cra_refcnt) && alg->cra_destroy) alg->cra_destroy(alg); } static inline int crypto_tmpl_get(struct crypto_template *tmpl) { return try_module_get(tmpl->module); } static inline void crypto_tmpl_put(struct crypto_template *tmpl) { module_put(tmpl->module); } static inline int crypto_is_larval(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_LARVAL; } static inline int crypto_is_dead(struct crypto_alg *alg) { return alg->cra_flags & CRYPTO_ALG_DEAD; } static inline int crypto_is_moribund(struct crypto_alg *alg) { return alg->cra_flags & (CRYPTO_ALG_DEAD | CRYPTO_ALG_DYING); } static inline void crypto_notify(unsigned long val, void *v) { blocking_notifier_call_chain(&crypto_chain, val, v); } static inline void crypto_yield(u32 flags) { if (flags & CRYPTO_TFM_REQ_MAY_SLEEP) cond_resched(); } #endif /* _CRYPTO_INTERNAL_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NODEMASK_H #define __LINUX_NODEMASK_H /* * Nodemasks provide a bitmap suitable for representing the * set of Node's in a system, one bit position per Node number. * * See detailed comments in the file linux/bitmap.h describing the * data type on which these nodemasks are based. * * For details of nodemask_parse_user(), see bitmap_parse_user() in * lib/bitmap.c. For details of nodelist_parse(), see bitmap_parselist(), * also in bitmap.c. For details of node_remap(), see bitmap_bitremap in * lib/bitmap.c. For details of nodes_remap(), see bitmap_remap in * lib/bitmap.c. For details of nodes_onto(), see bitmap_onto in * lib/bitmap.c. For details of nodes_fold(), see bitmap_fold in * lib/bitmap.c. * * The available nodemask operations are: * * void node_set(node, mask) turn on bit 'node' in mask * void node_clear(node, mask) turn off bit 'node' in mask * void nodes_setall(mask) set all bits * void nodes_clear(mask) clear all bits * int node_isset(node, mask) true iff bit 'node' set in mask * int node_test_and_set(node, mask) test and set bit 'node' in mask * * void nodes_and(dst, src1, src2) dst = src1 & src2 [intersection] * void nodes_or(dst, src1, src2) dst = src1 | src2 [union] * void nodes_xor(dst, src1, src2) dst = src1 ^ src2 * void nodes_andnot(dst, src1, src2) dst = src1 & ~src2 * void nodes_complement(dst, src) dst = ~src * * int nodes_equal(mask1, mask2) Does mask1 == mask2? * int nodes_intersects(mask1, mask2) Do mask1 and mask2 intersect? * int nodes_subset(mask1, mask2) Is mask1 a subset of mask2? * int nodes_empty(mask) Is mask empty (no bits sets)? * int nodes_full(mask) Is mask full (all bits sets)? * int nodes_weight(mask) Hamming weight - number of set bits * * void nodes_shift_right(dst, src, n) Shift right * void nodes_shift_left(dst, src, n) Shift left * * int first_node(mask) Number lowest set bit, or MAX_NUMNODES * int next_node(node, mask) Next node past 'node', or MAX_NUMNODES * int next_node_in(node, mask) Next node past 'node', or wrap to first, * or MAX_NUMNODES * int first_unset_node(mask) First node not set in mask, or * MAX_NUMNODES * * nodemask_t nodemask_of_node(node) Return nodemask with bit 'node' set * NODE_MASK_ALL Initializer - all bits set * NODE_MASK_NONE Initializer - no bits set * unsigned long *nodes_addr(mask) Array of unsigned long's in mask * * int nodemask_parse_user(ubuf, ulen, mask) Parse ascii string as nodemask * int nodelist_parse(buf, map) Parse ascii string as nodelist * int node_remap(oldbit, old, new) newbit = map(old, new)(oldbit) * void nodes_remap(dst, src, old, new) *dst = map(old, new)(src) * void nodes_onto(dst, orig, relmap) *dst = orig relative to relmap * void nodes_fold(dst, orig, sz) dst bits = orig bits mod sz * * for_each_node_mask(node, mask) for-loop node over mask * * int num_online_nodes() Number of online Nodes * int num_possible_nodes() Number of all possible Nodes * * int node_random(mask) Random node with set bit in mask * * int node_online(node) Is some node online? * int node_possible(node) Is some node possible? * * node_set_online(node) set bit 'node' in node_online_map * node_set_offline(node) clear bit 'node' in node_online_map * * for_each_node(node) for-loop node over node_possible_map * for_each_online_node(node) for-loop node over node_online_map * * Subtlety: * 1) The 'type-checked' form of node_isset() causes gcc (3.3.2, anyway) * to generate slightly worse code. So use a simple one-line #define * for node_isset(), instead of wrapping an inline inside a macro, the * way we do the other calls. * * NODEMASK_SCRATCH * When doing above logical AND, OR, XOR, Remap operations the callers tend to * need temporary nodemask_t's on the stack. But if NODES_SHIFT is large, * nodemask_t's consume too much stack space. NODEMASK_SCRATCH is a helper * for such situations. See below and CPUMASK_ALLOC also. */ #include <linux/threads.h> #include <linux/bitmap.h> #include <linux/minmax.h> #include <linux/numa.h> typedef struct { DECLARE_BITMAP(bits, MAX_NUMNODES); } nodemask_t; extern nodemask_t _unused_nodemask_arg_; /** * nodemask_pr_args - printf args to output a nodemask * @maskp: nodemask to be printed * * Can be used to provide arguments for '%*pb[l]' when printing a nodemask. */ #define nodemask_pr_args(maskp) __nodemask_pr_numnodes(maskp), \ __nodemask_pr_bits(maskp) static inline unsigned int __nodemask_pr_numnodes(const nodemask_t *m) { return m ? MAX_NUMNODES : 0; } static inline const unsigned long *__nodemask_pr_bits(const nodemask_t *m) { return m ? m->bits : NULL; } /* * The inline keyword gives the compiler room to decide to inline, or * not inline a function as it sees best. However, as these functions * are called in both __init and non-__init functions, if they are not * inlined we will end up with a section mis-match error (of the type of * freeable items not being freed). So we must use __always_inline here * to fix the problem. If other functions in the future also end up in * this situation they will also need to be annotated as __always_inline */ #define node_set(node, dst) __node_set((node), &(dst)) static __always_inline void __node_set(int node, volatile nodemask_t *dstp) { set_bit(node, dstp->bits); } #define node_clear(node, dst) __node_clear((node), &(dst)) static inline void __node_clear(int node, volatile nodemask_t *dstp) { clear_bit(node, dstp->bits); } #define nodes_setall(dst) __nodes_setall(&(dst), MAX_NUMNODES) static inline void __nodes_setall(nodemask_t *dstp, unsigned int nbits) { bitmap_fill(dstp->bits, nbits); } #define nodes_clear(dst) __nodes_clear(&(dst), MAX_NUMNODES) static inline void __nodes_clear(nodemask_t *dstp, unsigned int nbits) { bitmap_zero(dstp->bits, nbits); } /* No static inline type checking - see Subtlety (1) above. */ #define node_isset(node, nodemask) test_bit((node), (nodemask).bits) #define node_test_and_set(node, nodemask) \ __node_test_and_set((node), &(nodemask)) static inline int __node_test_and_set(int node, nodemask_t *addr) { return test_and_set_bit(node, addr->bits); } #define nodes_and(dst, src1, src2) \ __nodes_and(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_and(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_or(dst, src1, src2) \ __nodes_or(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_or(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_xor(dst, src1, src2) \ __nodes_xor(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_xor(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_andnot(dst, src1, src2) \ __nodes_andnot(&(dst), &(src1), &(src2), MAX_NUMNODES) static inline void __nodes_andnot(nodemask_t *dstp, const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); } #define nodes_complement(dst, src) \ __nodes_complement(&(dst), &(src), MAX_NUMNODES) static inline void __nodes_complement(nodemask_t *dstp, const nodemask_t *srcp, unsigned int nbits) { bitmap_complement(dstp->bits, srcp->bits, nbits); } #define nodes_equal(src1, src2) \ __nodes_equal(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_equal(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_equal(src1p->bits, src2p->bits, nbits); } #define nodes_intersects(src1, src2) \ __nodes_intersects(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_intersects(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_intersects(src1p->bits, src2p->bits, nbits); } #define nodes_subset(src1, src2) \ __nodes_subset(&(src1), &(src2), MAX_NUMNODES) static inline int __nodes_subset(const nodemask_t *src1p, const nodemask_t *src2p, unsigned int nbits) { return bitmap_subset(src1p->bits, src2p->bits, nbits); } #define nodes_empty(src) __nodes_empty(&(src), MAX_NUMNODES) static inline int __nodes_empty(const nodemask_t *srcp, unsigned int nbits) { return bitmap_empty(srcp->bits, nbits); } #define nodes_full(nodemask) __nodes_full(&(nodemask), MAX_NUMNODES) static inline int __nodes_full(const nodemask_t *srcp, unsigned int nbits) { return bitmap_full(srcp->bits, nbits); } #define nodes_weight(nodemask) __nodes_weight(&(nodemask), MAX_NUMNODES) static inline int __nodes_weight(const nodemask_t *srcp, unsigned int nbits) { return bitmap_weight(srcp->bits, nbits); } #define nodes_shift_right(dst, src, n) \ __nodes_shift_right(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_right(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); } #define nodes_shift_left(dst, src, n) \ __nodes_shift_left(&(dst), &(src), (n), MAX_NUMNODES) static inline void __nodes_shift_left(nodemask_t *dstp, const nodemask_t *srcp, int n, int nbits) { bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); } /* FIXME: better would be to fix all architectures to never return > MAX_NUMNODES, then the silly min_ts could be dropped. */ #define first_node(src) __first_node(&(src)) static inline int __first_node(const nodemask_t *srcp) { return min_t(int, MAX_NUMNODES, find_first_bit(srcp->bits, MAX_NUMNODES)); } #define next_node(n, src) __next_node((n), &(src)) static inline int __next_node(int n, const nodemask_t *srcp) { return min_t(int,MAX_NUMNODES,find_next_bit(srcp->bits, MAX_NUMNODES, n+1)); } /* * Find the next present node in src, starting after node n, wrapping around to * the first node in src if needed. Returns MAX_NUMNODES if src is empty. */ #define next_node_in(n, src) __next_node_in((n), &(src)) int __next_node_in(int node, const nodemask_t *srcp); static inline void init_nodemask_of_node(nodemask_t *mask, int node) { nodes_clear(*mask); node_set(node, *mask); } #define nodemask_of_node(node) \ ({ \ typeof(_unused_nodemask_arg_) m; \ if (sizeof(m) == sizeof(unsigned long)) { \ m.bits[0] = 1UL << (node); \ } else { \ init_nodemask_of_node(&m, (node)); \ } \ m; \ }) #define first_unset_node(mask) __first_unset_node(&(mask)) static inline int __first_unset_node(const nodemask_t *maskp) { return min_t(int,MAX_NUMNODES, find_first_zero_bit(maskp->bits, MAX_NUMNODES)); } #define NODE_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(MAX_NUMNODES) #if MAX_NUMNODES <= BITS_PER_LONG #define NODE_MASK_ALL \ ((nodemask_t) { { \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #else #define NODE_MASK_ALL \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-2] = ~0UL, \ [BITS_TO_LONGS(MAX_NUMNODES)-1] = NODE_MASK_LAST_WORD \ } }) #endif #define NODE_MASK_NONE \ ((nodemask_t) { { \ [0 ... BITS_TO_LONGS(MAX_NUMNODES)-1] = 0UL \ } }) #define nodes_addr(src) ((src).bits) #define nodemask_parse_user(ubuf, ulen, dst) \ __nodemask_parse_user((ubuf), (ulen), &(dst), MAX_NUMNODES) static inline int __nodemask_parse_user(const char __user *buf, int len, nodemask_t *dstp, int nbits) { return bitmap_parse_user(buf, len, dstp->bits, nbits); } #define nodelist_parse(buf, dst) __nodelist_parse((buf), &(dst), MAX_NUMNODES) static inline int __nodelist_parse(const char *buf, nodemask_t *dstp, int nbits) { return bitmap_parselist(buf, dstp->bits, nbits); } #define node_remap(oldbit, old, new) \ __node_remap((oldbit), &(old), &(new), MAX_NUMNODES) static inline int __node_remap(int oldbit, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { return bitmap_bitremap(oldbit, oldp->bits, newp->bits, nbits); } #define nodes_remap(dst, src, old, new) \ __nodes_remap(&(dst), &(src), &(old), &(new), MAX_NUMNODES) static inline void __nodes_remap(nodemask_t *dstp, const nodemask_t *srcp, const nodemask_t *oldp, const nodemask_t *newp, int nbits) { bitmap_remap(dstp->bits, srcp->bits, oldp->bits, newp->bits, nbits); } #define nodes_onto(dst, orig, relmap) \ __nodes_onto(&(dst), &(orig), &(relmap), MAX_NUMNODES) static inline void __nodes_onto(nodemask_t *dstp, const nodemask_t *origp, const nodemask_t *relmapp, int nbits) { bitmap_onto(dstp->bits, origp->bits, relmapp->bits, nbits); } #define nodes_fold(dst, orig, sz) \ __nodes_fold(&(dst), &(orig), sz, MAX_NUMNODES) static inline void __nodes_fold(nodemask_t *dstp, const nodemask_t *origp, int sz, int nbits) { bitmap_fold(dstp->bits, origp->bits, sz, nbits); } #if MAX_NUMNODES > 1 #define for_each_node_mask(node, mask) \ for ((node) = first_node(mask); \ (node) < MAX_NUMNODES; \ (node) = next_node((node), (mask))) #else /* MAX_NUMNODES == 1 */ #define for_each_node_mask(node, mask) \ if (!nodes_empty(mask)) \ for ((node) = 0; (node) < 1; (node)++) #endif /* MAX_NUMNODES */ /* * Bitmasks that are kept for all the nodes. */ enum node_states { N_POSSIBLE, /* The node could become online at some point */ N_ONLINE, /* The node is online */ N_NORMAL_MEMORY, /* The node has regular memory */ #ifdef CONFIG_HIGHMEM N_HIGH_MEMORY, /* The node has regular or high memory */ #else N_HIGH_MEMORY = N_NORMAL_MEMORY, #endif N_MEMORY, /* The node has memory(regular, high, movable) */ N_CPU, /* The node has one or more cpus */ N_GENERIC_INITIATOR, /* The node has one or more Generic Initiators */ NR_NODE_STATES }; /* * The following particular system nodemasks and operations * on them manage all possible and online nodes. */ extern nodemask_t node_states[NR_NODE_STATES]; #if MAX_NUMNODES > 1 static inline int node_state(int node, enum node_states state) { return node_isset(node, node_states[state]); } static inline void node_set_state(int node, enum node_states state) { __node_set(node, &node_states[state]); } static inline void node_clear_state(int node, enum node_states state) { __node_clear(node, &node_states[state]); } static inline int num_node_state(enum node_states state) { return nodes_weight(node_states[state]); } #define for_each_node_state(__node, __state) \ for_each_node_mask((__node), node_states[__state]) #define first_online_node first_node(node_states[N_ONLINE]) #define first_memory_node first_node(node_states[N_MEMORY]) static inline int next_online_node(int nid) { return next_node(nid, node_states[N_ONLINE]); } static inline int next_memory_node(int nid) { return next_node(nid, node_states[N_MEMORY]); } extern unsigned int nr_node_ids; extern unsigned int nr_online_nodes; static inline void node_set_online(int nid) { node_set_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } static inline void node_set_offline(int nid) { node_clear_state(nid, N_ONLINE); nr_online_nodes = num_node_state(N_ONLINE); } #else static inline int node_state(int node, enum node_states state) { return node == 0; } static inline void node_set_state(int node, enum node_states state) { } static inline void node_clear_state(int node, enum node_states state) { } static inline int num_node_state(enum node_states state) { return 1; } #define for_each_node_state(node, __state) \ for ( (node) = 0; (node) == 0; (node) = 1) #define first_online_node 0 #define first_memory_node 0 #define next_online_node(nid) (MAX_NUMNODES) #define nr_node_ids 1U #define nr_online_nodes 1U #define node_set_online(node) node_set_state((node), N_ONLINE) #define node_set_offline(node) node_clear_state((node), N_ONLINE) #endif #if defined(CONFIG_NUMA) && (MAX_NUMNODES > 1) extern int node_random(const nodemask_t *maskp); #else static inline int node_random(const nodemask_t *mask) { return 0; } #endif #define node_online_map node_states[N_ONLINE] #define node_possible_map node_states[N_POSSIBLE] #define num_online_nodes() num_node_state(N_ONLINE) #define num_possible_nodes() num_node_state(N_POSSIBLE) #define node_online(node) node_state((node), N_ONLINE) #define node_possible(node) node_state((node), N_POSSIBLE) #define for_each_node(node) for_each_node_state(node, N_POSSIBLE) #define for_each_online_node(node) for_each_node_state(node, N_ONLINE) /* * For nodemask scrach area. * NODEMASK_ALLOC(type, name) allocates an object with a specified type and * name. */ #if NODES_SHIFT > 8 /* nodemask_t > 32 bytes */ #define NODEMASK_ALLOC(type, name, gfp_flags) \ type *name = kmalloc(sizeof(*name), gfp_flags) #define NODEMASK_FREE(m) kfree(m) #else #define NODEMASK_ALLOC(type, name, gfp_flags) type _##name, *name = &_##name #define NODEMASK_FREE(m) do {} while (0) #endif /* A example struture for using NODEMASK_ALLOC, used in mempolicy. */ struct nodemask_scratch { nodemask_t mask1; nodemask_t mask2; }; #define NODEMASK_SCRATCH(x) \ NODEMASK_ALLOC(struct nodemask_scratch, x, \ GFP_KERNEL | __GFP_NORETRY) #define NODEMASK_SCRATCH_FREE(x) NODEMASK_FREE(x) #endif /* __LINUX_NODEMASK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 /* SPDX-License-Identifier: GPL-2.0 OR MIT */ #ifndef __LINUX_OVERFLOW_H #define __LINUX_OVERFLOW_H #include <linux/compiler.h> #include <linux/limits.h> /* * In the fallback code below, we need to compute the minimum and * maximum values representable in a given type. These macros may also * be useful elsewhere, so we provide them outside the * COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW block. * * It would seem more obvious to do something like * * #define type_min(T) (T)(is_signed_type(T) ? (T)1 << (8*sizeof(T)-1) : 0) * #define type_max(T) (T)(is_signed_type(T) ? ((T)1 << (8*sizeof(T)-1)) - 1 : ~(T)0) * * Unfortunately, the middle expressions, strictly speaking, have * undefined behaviour, and at least some versions of gcc warn about * the type_max expression (but not if -fsanitize=undefined is in * effect; in that case, the warning is deferred to runtime...). * * The slightly excessive casting in type_min is to make sure the * macros also produce sensible values for the exotic type _Bool. [The * overflow checkers only almost work for _Bool, but that's * a-feature-not-a-bug, since people shouldn't be doing arithmetic on * _Bools. Besides, the gcc builtins don't allow _Bool* as third * argument.] * * Idea stolen from * https://mail-index.netbsd.org/tech-misc/2007/02/05/0000.html - * credit to Christian Biere. */ #define is_signed_type(type) (((type)(-1)) < (type)1) #define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) #define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) #define type_min(T) ((T)((T)-type_max(T)-(T)1)) /* * Avoids triggering -Wtype-limits compilation warning, * while using unsigned data types to check a < 0. */ #define is_non_negative(a) ((a) > 0 || (a) == 0) #define is_negative(a) (!(is_non_negative(a))) /* * Allows for effectively applying __must_check to a macro so we can have * both the type-agnostic benefits of the macros while also being able to * enforce that the return value is, in fact, checked. */ static inline bool __must_check __must_check_overflow(bool overflow) { return unlikely(overflow); } #ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW /* * For simplicity and code hygiene, the fallback code below insists on * a, b and *d having the same type (similar to the min() and max() * macros), whereas gcc's type-generic overflow checkers accept * different types. Hence we don't just make check_add_overflow an * alias for __builtin_add_overflow, but add type checks similar to * below. */ #define check_add_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_add_overflow(__a, __b, __d); \ })) #define check_sub_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_sub_overflow(__a, __b, __d); \ })) #define check_mul_overflow(a, b, d) __must_check_overflow(({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ __builtin_mul_overflow(__a, __b, __d); \ })) #else /* Checking for unsigned overflow is relatively easy without causing UB. */ #define __unsigned_add_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = __a + __b; \ *__d < __a; \ }) #define __unsigned_sub_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = __a - __b; \ __a < __b; \ }) /* * If one of a or b is a compile-time constant, this avoids a division. */ #define __unsigned_mul_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = __a * __b; \ __builtin_constant_p(__b) ? \ __b > 0 && __a > type_max(typeof(__a)) / __b : \ __a > 0 && __b > type_max(typeof(__b)) / __a; \ }) /* * For signed types, detecting overflow is much harder, especially if * we want to avoid UB. But the interface of these macros is such that * we must provide a result in *d, and in fact we must produce the * result promised by gcc's builtins, which is simply the possibly * wrapped-around value. Fortunately, we can just formally do the * operations in the widest relevant unsigned type (u64) and then * truncate the result - gcc is smart enough to generate the same code * with and without the (u64) casts. */ /* * Adding two signed integers can overflow only if they have the same * sign, and overflow has happened iff the result has the opposite * sign. */ #define __signed_add_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = (u64)__a + (u64)__b; \ (((~(__a ^ __b)) & (*__d ^ __a)) \ & type_min(typeof(__a))) != 0; \ }) /* * Subtraction is similar, except that overflow can now happen only * when the signs are opposite. In this case, overflow has happened if * the result has the opposite sign of a. */ #define __signed_sub_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = (u64)__a - (u64)__b; \ ((((__a ^ __b)) & (*__d ^ __a)) \ & type_min(typeof(__a))) != 0; \ }) /* * Signed multiplication is rather hard. gcc always follows C99, so * division is truncated towards 0. This means that we can write the * overflow check like this: * * (a > 0 && (b > MAX/a || b < MIN/a)) || * (a < -1 && (b > MIN/a || b < MAX/a) || * (a == -1 && b == MIN) * * The redundant casts of -1 are to silence an annoying -Wtype-limits * (included in -Wextra) warning: When the type is u8 or u16, the * __b_c_e in check_mul_overflow obviously selects * __unsigned_mul_overflow, but unfortunately gcc still parses this * code and warns about the limited range of __b. */ #define __signed_mul_overflow(a, b, d) ({ \ typeof(a) __a = (a); \ typeof(b) __b = (b); \ typeof(d) __d = (d); \ typeof(a) __tmax = type_max(typeof(a)); \ typeof(a) __tmin = type_min(typeof(a)); \ (void) (&__a == &__b); \ (void) (&__a == __d); \ *__d = (u64)__a * (u64)__b; \ (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ (__b == (typeof(__b))-1 && __a == __tmin); \ }) #define check_add_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_add_overflow(a, b, d), \ __unsigned_add_overflow(a, b, d))) #define check_sub_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_sub_overflow(a, b, d), \ __unsigned_sub_overflow(a, b, d))) #define check_mul_overflow(a, b, d) __must_check_overflow( \ __builtin_choose_expr(is_signed_type(typeof(a)), \ __signed_mul_overflow(a, b, d), \ __unsigned_mul_overflow(a, b, d))) #endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ /** check_shl_overflow() - Calculate a left-shifted value and check overflow * * @a: Value to be shifted * @s: How many bits left to shift * @d: Pointer to where to store the result * * Computes *@d = (@a << @s) * * Returns true if '*d' cannot hold the result or when 'a << s' doesn't * make sense. Example conditions: * - 'a << s' causes bits to be lost when stored in *d. * - 's' is garbage (e.g. negative) or so large that the result of * 'a << s' is guaranteed to be 0. * - 'a' is negative. * - 'a << s' sets the sign bit, if any, in '*d'. * * '*d' will hold the results of the attempted shift, but is not * considered "safe for use" if false is returned. */ #define check_shl_overflow(a, s, d) __must_check_overflow(({ \ typeof(a) _a = a; \ typeof(s) _s = s; \ typeof(d) _d = d; \ u64 _a_full = _a; \ unsigned int _to_shift = \ is_non_negative(_s) && _s < 8 * sizeof(*d) ? _s : 0; \ *_d = (_a_full << _to_shift); \ (_to_shift != _s || is_negative(*_d) || is_negative(_a) || \ (*_d >> _to_shift) != _a); \ })) /** * array_size() - Calculate size of 2-dimensional array. * * @a: dimension one * @b: dimension two * * Calculates size of 2-dimensional array: @a * @b. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ static inline __must_check size_t array_size(size_t a, size_t b) { size_t bytes; if (check_mul_overflow(a, b, &bytes)) return SIZE_MAX; return bytes; } /** * array3_size() - Calculate size of 3-dimensional array. * * @a: dimension one * @b: dimension two * @c: dimension three * * Calculates size of 3-dimensional array: @a * @b * @c. * * Returns: number of bytes needed to represent the array or SIZE_MAX on * overflow. */ static inline __must_check size_t array3_size(size_t a, size_t b, size_t c) { size_t bytes; if (check_mul_overflow(a, b, &bytes)) return SIZE_MAX; if (check_mul_overflow(bytes, c, &bytes)) return SIZE_MAX; return bytes; } /* * Compute a*b+c, returning SIZE_MAX on overflow. Internal helper for * struct_size() below. */ static inline __must_check size_t __ab_c_size(size_t a, size_t b, size_t c) { size_t bytes; if (check_mul_overflow(a, b, &bytes)) return SIZE_MAX; if (check_add_overflow(bytes, c, &bytes)) return SIZE_MAX; return bytes; } /** * struct_size() - Calculate size of structure with trailing array. * @p: Pointer to the structure. * @member: Name of the array member. * @count: Number of elements in the array. * * Calculates size of memory needed for structure @p followed by an * array of @count number of @member elements. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define struct_size(p, member, count) \ __ab_c_size(count, \ sizeof(*(p)->member) + __must_be_array((p)->member),\ sizeof(*(p))) /** * flex_array_size() - Calculate size of a flexible array member * within an enclosing structure. * * @p: Pointer to the structure. * @member: Name of the flexible array member. * @count: Number of elements in the array. * * Calculates size of a flexible array of @count number of @member * elements, at the end of structure @p. * * Return: number of bytes needed or SIZE_MAX on overflow. */ #define flex_array_size(p, member, count) \ array_size(count, \ sizeof(*(p)->member) + __must_be_array((p)->member)) #endif /* __LINUX_OVERFLOW_H */
1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 // SPDX-License-Identifier: GPL-2.0-only /* * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> * * Copyright (C) 2017 Facebook Inc. * Copyright (C) 2017 Dennis Zhou <dennis@kernel.org> * * The percpu allocator handles both static and dynamic areas. Percpu * areas are allocated in chunks which are divided into units. There is * a 1-to-1 mapping for units to possible cpus. These units are grouped * based on NUMA properties of the machine. * * c0 c1 c2 * ------------------- ------------------- ------------ * | u0 | u1 | u2 | u3 | | u0 | u1 | u2 | u3 | | u0 | u1 | u * ------------------- ...... ------------------- .... ------------ * * Allocation is done by offsets into a unit's address space. Ie., an * area of 512 bytes at 6k in c1 occupies 512 bytes at 6k in c1:u0, * c1:u1, c1:u2, etc. On NUMA machines, the mapping may be non-linear * and even sparse. Access is handled by configuring percpu base * registers according to the cpu to unit mappings and offsetting the * base address using pcpu_unit_size. * * There is special consideration for the first chunk which must handle * the static percpu variables in the kernel image as allocation services * are not online yet. In short, the first chunk is structured like so: * * <Static | [Reserved] | Dynamic> * * The static data is copied from the original section managed by the * linker. The reserved section, if non-zero, primarily manages static * percpu variables from kernel modules. Finally, the dynamic section * takes care of normal allocations. * * The allocator organizes chunks into lists according to free size and * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT * flag should be passed. All memcg-aware allocations are sharing one set * of chunks and all unaccounted allocations and allocations performed * by processes belonging to the root memory cgroup are using the second set. * * The allocator tries to allocate from the fullest chunk first. Each chunk * is managed by a bitmap with metadata blocks. The allocation map is updated * on every allocation and free to reflect the current state while the boundary * map is only updated on allocation. Each metadata block contains * information to help mitigate the need to iterate over large portions * of the bitmap. The reverse mapping from page to chunk is stored in * the page's index. Lastly, units are lazily backed and grow in unison. * * There is a unique conversion that goes on here between bytes and bits. * Each bit represents a fragment of size PCPU_MIN_ALLOC_SIZE. The chunk * tracks the number of pages it is responsible for in nr_pages. Helper * functions are used to convert from between the bytes, bits, and blocks. * All hints are managed in bits unless explicitly stated. * * To use this allocator, arch code should do the following: * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default * * - use pcpu_setup_first_chunk() during percpu area initialization to * setup the first chunk containing the kernel static percpu area */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/bitmap.h> #include <linux/memblock.h> #include <linux/err.h> #include <linux/lcm.h> #include <linux/list.h> #include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> #include <linux/percpu.h> #include <linux/pfn.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> #include <linux/kmemleak.h> #include <linux/sched.h> #include <linux/sched/mm.h> #include <linux/memcontrol.h> #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/io.h> #define CREATE_TRACE_POINTS #include <trace/events/percpu.h> #include "percpu-internal.h" /* the slots are sorted by free bytes left, 1-31 bytes share the same slot */ #define PCPU_SLOT_BASE_SHIFT 5 /* chunks in slots below this are subject to being sidelined on failed alloc */ #define PCPU_SLOT_FAIL_THRESHOLD 3 #define PCPU_EMPTY_POP_PAGES_LOW 2 #define PCPU_EMPTY_POP_PAGES_HIGH 4 #ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ (void __percpu *)((unsigned long)(addr) - \ (unsigned long)pcpu_base_addr + \ (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ (void __force *)((unsigned long)(ptr) + \ (unsigned long)pcpu_base_addr - \ (unsigned long)__per_cpu_start) #endif #else /* CONFIG_SMP */ /* on UP, it's always identity mapped */ #define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) #define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) #endif /* CONFIG_SMP */ static int pcpu_unit_pages __ro_after_init; static int pcpu_unit_size __ro_after_init; static int pcpu_nr_units __ro_after_init; static int pcpu_atom_size __ro_after_init; int pcpu_nr_slots __ro_after_init; static size_t pcpu_chunk_struct_size __ro_after_init; /* cpus with the lowest and highest unit addresses */ static unsigned int pcpu_low_unit_cpu __ro_after_init; static unsigned int pcpu_high_unit_cpu __ro_after_init; /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __ro_after_init; EXPORT_SYMBOL_GPL(pcpu_base_addr); static const int *pcpu_unit_map __ro_after_init; /* cpu -> unit */ const unsigned long *pcpu_unit_offsets __ro_after_init; /* cpu -> unit offset */ /* group information, used for vm allocation */ static int pcpu_nr_groups __ro_after_init; static const unsigned long *pcpu_group_offsets __ro_after_init; static const size_t *pcpu_group_sizes __ro_after_init; /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different * ways and thus often doesn't live in the vmalloc area. */ struct pcpu_chunk *pcpu_first_chunk __ro_after_init; /* * Optional reserved chunk. This chunk reserves part of the first * chunk and serves it for reserved allocations. When the reserved * region doesn't exist, the following variable is NULL. */ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init; DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */ /* chunks which need their map areas extended, protected by pcpu_lock */ static LIST_HEAD(pcpu_map_extend_chunks); /* * The number of empty populated pages by chunk type, protected by pcpu_lock. * The reserved chunk doesn't contribute to the count. */ int pcpu_nr_empty_pop_pages[PCPU_NR_CHUNK_TYPES]; /* * The number of populated pages in use by the allocator, protected by * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets * allocated/deallocated, it is allocated/deallocated in all units of a chunk * and increments/decrements this count by 1). */ static unsigned long pcpu_nr_populated; /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between * PCPU_EMPTY_POP_PAGES_LOW and HIGH for atomic allocations and at most one * empty chunk. */ static void pcpu_balance_workfn(struct work_struct *work); static DECLARE_WORK(pcpu_balance_work, pcpu_balance_workfn); static bool pcpu_async_enabled __read_mostly; static bool pcpu_atomic_alloc_failed; static void pcpu_schedule_balance_work(void) { if (pcpu_async_enabled) schedule_work(&pcpu_balance_work); } /** * pcpu_addr_in_chunk - check if the address is served from this chunk * @chunk: chunk of interest * @addr: percpu address * * RETURNS: * True if the address is served from this chunk. */ static bool pcpu_addr_in_chunk(struct pcpu_chunk *chunk, void *addr) { void *start_addr, *end_addr; if (!chunk) return false; start_addr = chunk->base_addr + chunk->start_offset; end_addr = chunk->base_addr + chunk->nr_pages * PAGE_SIZE - chunk->end_offset; return addr >= start_addr && addr < end_addr; } static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1); } static int pcpu_size_to_slot(int size) { if (size == pcpu_unit_size) return pcpu_nr_slots - 1; return __pcpu_size_to_slot(size); } static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) { const struct pcpu_block_md *chunk_md = &chunk->chunk_md; if (chunk->free_bytes < PCPU_MIN_ALLOC_SIZE || chunk_md->contig_hint == 0) return 0; return pcpu_size_to_slot(chunk_md->contig_hint * PCPU_MIN_ALLOC_SIZE); } /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { page->index = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { return (struct pcpu_chunk *)page->index; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } static unsigned long pcpu_unit_page_offset(unsigned int cpu, int page_idx) { return pcpu_unit_offsets[cpu] + (page_idx << PAGE_SHIFT); } static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, unsigned int cpu, int page_idx) { return (unsigned long)chunk->base_addr + pcpu_unit_page_offset(cpu, page_idx); } /* * The following are helper functions to help access bitmaps and convert * between bitmap offsets to address offsets. */ static unsigned long *pcpu_index_alloc_map(struct pcpu_chunk *chunk, int index) { return chunk->alloc_map + (index * PCPU_BITMAP_BLOCK_BITS / BITS_PER_LONG); } static unsigned long pcpu_off_to_block_index(int off) { return off / PCPU_BITMAP_BLOCK_BITS; } static unsigned long pcpu_off_to_block_off(int off) { return off & (PCPU_BITMAP_BLOCK_BITS - 1); } static unsigned long pcpu_block_off_to_off(int index, int off) { return index * PCPU_BITMAP_BLOCK_BITS + off; } /* * pcpu_next_hint - determine which hint to use * @block: block of interest * @alloc_bits: size of allocation * * This determines if we should scan based on the scan_hint or first_free. * In general, we want to scan from first_free to fulfill allocations by * first fit. However, if we know a scan_hint at position scan_hint_start * cannot fulfill an allocation, we can begin scanning from there knowing * the contig_hint will be our fallback. */ static int pcpu_next_hint(struct pcpu_block_md *block, int alloc_bits) { /* * The three conditions below determine if we can skip past the * scan_hint. First, does the scan hint exist. Second, is the * contig_hint after the scan_hint (possibly not true iff * contig_hint == scan_hint). Third, is the allocation request * larger than the scan_hint. */ if (block->scan_hint && block->contig_hint_start > block->scan_hint_start && alloc_bits > block->scan_hint) return block->scan_hint_start + block->scan_hint; return block->first_free; } /** * pcpu_next_md_free_region - finds the next hint free area * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Helper function for pcpu_for_each_md_free_region. It checks * block->contig_hint and performs aggregation across blocks to find the * next hint. It modifies bit_off and bits in-place to be consumed in the * loop. */ static void pcpu_next_md_free_region(struct pcpu_chunk *chunk, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; return; } /* * This checks three things. First is there a contig_hint to * check. Second, have we checked this hint before by * comparing the block_off. Third, is this the same as the * right contig hint. In the last case, it spills over into * the next block and should be handled by the contig area * across blocks code. */ *bits = block->contig_hint; if (*bits && block->contig_hint_start >= block_off && *bits + block->contig_hint_start < PCPU_BITMAP_BLOCK_BITS) { *bit_off = pcpu_block_off_to_off(i, block->contig_hint_start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bits = block->right_free; *bit_off = (i + 1) * PCPU_BITMAP_BLOCK_BITS - block->right_free; } } /** * pcpu_next_fit_region - finds fit areas for a given allocation request * @chunk: chunk of interest * @alloc_bits: size of allocation * @align: alignment of area (max PAGE_SIZE) * @bit_off: chunk offset * @bits: size of free area * * Finds the next free region that is viable for use with a given size and * alignment. This only returns if there is a valid area to be used for this * allocation. block->first_free is returned if the allocation request fits * within the block to see if the request can be fulfilled prior to the contig * hint. */ static void pcpu_next_fit_region(struct pcpu_chunk *chunk, int alloc_bits, int align, int *bit_off, int *bits) { int i = pcpu_off_to_block_index(*bit_off); int block_off = pcpu_off_to_block_off(*bit_off); struct pcpu_block_md *block; *bits = 0; for (block = chunk->md_blocks + i; i < pcpu_chunk_nr_blocks(chunk); block++, i++) { /* handles contig area across blocks */ if (*bits) { *bits += block->left_free; if (*bits >= alloc_bits) return; if (block->left_free == PCPU_BITMAP_BLOCK_BITS) continue; } /* check block->contig_hint */ *bits = ALIGN(block->contig_hint_start, align) - block->contig_hint_start; /* * This uses the block offset to determine if this has been * checked in the prior iteration. */ if (block->contig_hint && block->contig_hint_start >= block_off && block->contig_hint >= *bits + alloc_bits) { int start = pcpu_next_hint(block, alloc_bits); *bits += alloc_bits + block->contig_hint_start - start; *bit_off = pcpu_block_off_to_off(i, start); return; } /* reset to satisfy the second predicate above */ block_off = 0; *bit_off = ALIGN(PCPU_BITMAP_BLOCK_BITS - block->right_free, align); *bits = PCPU_BITMAP_BLOCK_BITS - *bit_off; *bit_off = pcpu_block_off_to_off(i, *bit_off); if (*bits >= alloc_bits) return; } /* no valid offsets were found - fail condition */ *bit_off = pcpu_chunk_map_bits(chunk); } /* * Metadata free area iterators. These perform aggregation of free areas * based on the metadata blocks and return the offset @bit_off and size in * bits of the free area @bits. pcpu_for_each_fit_region only returns when * a fit is found for the allocation request. */ #define pcpu_for_each_md_free_region(chunk, bit_off, bits) \ for (pcpu_next_md_free_region((chunk), &(bit_off), &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits) + 1, \ pcpu_next_md_free_region((chunk), &(bit_off), &(bits))) #define pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) \ for (pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits)); \ (bit_off) < pcpu_chunk_map_bits((chunk)); \ (bit_off) += (bits), \ pcpu_next_fit_region((chunk), (alloc_bits), (align), &(bit_off), \ &(bits))) /** * pcpu_mem_zalloc - allocate memory * @size: bytes to allocate * @gfp: allocation flags * * Allocate @size bytes. If @size is smaller than PAGE_SIZE, * kzalloc() is used; otherwise, the equivalent of vzalloc() is used. * This is to facilitate passing through whitelisted flags. The * returned memory is always zeroed. * * RETURNS: * Pointer to the allocated area on success, NULL on failure. */ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp) { if (WARN_ON_ONCE(!slab_is_available())) return NULL; if (size <= PAGE_SIZE) return kzalloc(size, gfp); else return __vmalloc(size, gfp | __GFP_ZERO); } /** * pcpu_mem_free - free memory * @ptr: memory to free * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ static void pcpu_mem_free(void *ptr) { kvfree(ptr); } static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot, bool move_front) { if (chunk != pcpu_reserved_chunk) { struct list_head *pcpu_slot; pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk)); if (move_front) list_move(&chunk->list, &pcpu_slot[slot]); else list_move_tail(&chunk->list, &pcpu_slot[slot]); } } static void pcpu_chunk_move(struct pcpu_chunk *chunk, int slot) { __pcpu_chunk_move(chunk, slot, true); } /** * pcpu_chunk_relocate - put chunk in the appropriate chunk slot * @chunk: chunk of interest * @oslot: the previous slot it was on * * This function is called after an allocation or free changed @chunk. * New slot according to the changed state is determined and @chunk is * moved to the slot. Note that the reserved chunk is never put on * chunk slots. * * CONTEXT: * pcpu_lock. */ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) { int nslot = pcpu_chunk_slot(chunk); if (oslot != nslot) __pcpu_chunk_move(chunk, nslot, oslot < nslot); } /* * pcpu_update_empty_pages - update empty page counters * @chunk: chunk of interest * @nr: nr of empty pages * * This is used to keep track of the empty pages now based on the premise * a md_block covers a page. The hint update functions recognize if a block * is made full or broken to calculate deltas for keeping track of free pages. */ static inline void pcpu_update_empty_pages(struct pcpu_chunk *chunk, int nr) { chunk->nr_empty_pop_pages += nr; if (chunk != pcpu_reserved_chunk) pcpu_nr_empty_pop_pages[pcpu_chunk_type(chunk)] += nr; } /* * pcpu_region_overlap - determines if two regions overlap * @a: start of first region, inclusive * @b: end of first region, exclusive * @x: start of second region, inclusive * @y: end of second region, exclusive * * This is used to determine if the hint region [a, b) overlaps with the * allocated region [x, y). */ static inline bool pcpu_region_overlap(int a, int b, int x, int y) { return (a < y) && (x < b); } /** * pcpu_block_update - updates a block given a free area * @block: block of interest * @start: start offset in block * @end: end offset in block * * Updates a block given a known free area. The region [start, end) is * expected to be the entirety of the free area within a block. Chooses * the best starting offset if the contig hints are equal. */ static void pcpu_block_update(struct pcpu_block_md *block, int start, int end) { int contig = end - start; block->first_free = min(block->first_free, start); if (start == 0) block->left_free = contig; if (end == block->nr_bits) block->right_free = contig; if (contig > block->contig_hint) { /* promote the old contig_hint to be the new scan_hint */ if (start > block->contig_hint_start) { if (block->contig_hint > block->scan_hint) { block->scan_hint_start = block->contig_hint_start; block->scan_hint = block->contig_hint; } else if (start < block->scan_hint_start) { /* * The old contig_hint == scan_hint. But, the * new contig is larger so hold the invariant * scan_hint_start < contig_hint_start. */ block->scan_hint = 0; } } else { block->scan_hint = 0; } block->contig_hint_start = start; block->contig_hint = contig; } else if (contig == block->contig_hint) { if (block->contig_hint_start && (!start || __ffs(start) > __ffs(block->contig_hint_start))) { /* start has a better alignment so use it */ block->contig_hint_start = start; if (start < block->scan_hint_start && block->contig_hint > block->scan_hint) block->scan_hint = 0; } else if (start > block->scan_hint_start || block->contig_hint > block->scan_hint) { /* * Knowing contig == contig_hint, update the scan_hint * if it is farther than or larger than the current * scan_hint. */ block->scan_hint_start = start; block->scan_hint = contig; } } else { /* * The region is smaller than the contig_hint. So only update * the scan_hint if it is larger than or equal and farther than * the current scan_hint. */ if ((start < block->contig_hint_start && (contig > block->scan_hint || (contig == block->scan_hint && start > block->scan_hint_start)))) { block->scan_hint_start = start; block->scan_hint = contig; } } } /* * pcpu_block_update_scan - update a block given a free area from a scan * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of free area * * Finding the final allocation spot first goes through pcpu_find_block_fit() * to find a block that can hold the allocation and then pcpu_alloc_area() * where a scan is used. When allocations require specific alignments, * we can inadvertently create holes which will not be seen in the alloc * or free paths. * * This takes a given free area hole and updates a block as it may change the * scan_hint. We need to scan backwards to ensure we don't miss free bits * from alignment. */ static void pcpu_block_update_scan(struct pcpu_chunk *chunk, int bit_off, int bits) { int s_off = pcpu_off_to_block_off(bit_off); int e_off = s_off + bits; int s_index, l_bit; struct pcpu_block_md *block; if (e_off > PCPU_BITMAP_BLOCK_BITS) return; s_index = pcpu_off_to_block_index(bit_off); block = chunk->md_blocks + s_index; /* scan backwards in case of alignment skipping free bits */ l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), s_off); s_off = (s_off == l_bit) ? 0 : l_bit + 1; pcpu_block_update(block, s_off, e_off); } /** * pcpu_chunk_refresh_hint - updates metadata about a chunk * @chunk: chunk of interest * @full_scan: if we should scan from the beginning * * Iterates over the metadata blocks to find the largest contig area. * A full scan can be avoided on the allocation path as this is triggered * if we broke the contig_hint. In doing so, the scan_hint will be before * the contig_hint or after if the scan_hint == contig_hint. This cannot * be prevented on freeing as we want to find the largest area possibly * spanning blocks. */ static void pcpu_chunk_refresh_hint(struct pcpu_chunk *chunk, bool full_scan) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits; /* promote scan_hint to contig_hint */ if (!full_scan && chunk_md->scan_hint) { bit_off = chunk_md->scan_hint_start + chunk_md->scan_hint; chunk_md->contig_hint_start = chunk_md->scan_hint_start; chunk_md->contig_hint = chunk_md->scan_hint; chunk_md->scan_hint = 0; } else { bit_off = chunk_md->first_free; chunk_md->contig_hint = 0; } bits = 0; pcpu_for_each_md_free_region(chunk, bit_off, bits) pcpu_block_update(chunk_md, bit_off, bit_off + bits); } /** * pcpu_block_refresh_hint * @chunk: chunk of interest * @index: index of the metadata block * * Scans over the block beginning at first_free and updates the block * metadata accordingly. */ static void pcpu_block_refresh_hint(struct pcpu_chunk *chunk, int index) { struct pcpu_block_md *block = chunk->md_blocks + index; unsigned long *alloc_map = pcpu_index_alloc_map(chunk, index); unsigned int rs, re, start; /* region start, region end */ /* promote scan_hint to contig_hint */ if (block->scan_hint) { start = block->scan_hint_start + block->scan_hint; block->contig_hint_start = block->scan_hint_start; block->contig_hint = block->scan_hint; block->scan_hint = 0; } else { start = block->first_free; block->contig_hint = 0; } block->right_free = 0; /* iterate over free areas and update the contig hints */ bitmap_for_each_clear_region(alloc_map, rs, re, start, PCPU_BITMAP_BLOCK_BITS) pcpu_block_update(block, rs, re); } /** * pcpu_block_update_hint_alloc - update hint on allocation path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. The metadata only has to be * refreshed by a full scan iff the chunk's contig hint is broken. Block level * scans are required if the block's contig hint is broken. */ static void pcpu_block_update_hint_alloc(struct pcpu_chunk *chunk, int bit_off, int bits) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Update s_block. * block->first_free must be updated if the allocation takes its place. * If the allocation breaks the contig_hint, a scan is required to * restore this hint. */ if (s_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; if (s_off == s_block->first_free) s_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, s_index), PCPU_BITMAP_BLOCK_BITS, s_off + bits); if (pcpu_region_overlap(s_block->scan_hint_start, s_block->scan_hint_start + s_block->scan_hint, s_off, s_off + bits)) s_block->scan_hint = 0; if (pcpu_region_overlap(s_block->contig_hint_start, s_block->contig_hint_start + s_block->contig_hint, s_off, s_off + bits)) { /* block contig hint is broken - scan to fix it */ if (!s_off) s_block->left_free = 0; pcpu_block_refresh_hint(chunk, s_index); } else { /* update left and right contig manually */ s_block->left_free = min(s_block->left_free, s_off); if (s_index == e_index) s_block->right_free = min_t(int, s_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); else s_block->right_free = 0; } /* * Update e_block. */ if (s_index != e_index) { if (e_block->contig_hint == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; /* * When the allocation is across blocks, the end is along * the left part of the e_block. */ e_block->first_free = find_next_zero_bit( pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, e_off); if (e_off == PCPU_BITMAP_BLOCK_BITS) { /* reset the block */ e_block++; } else { if (e_off > e_block->scan_hint_start) e_block->scan_hint = 0; e_block->left_free = 0; if (e_off > e_block->contig_hint_start) { /* contig hint is broken - scan to fix it */ pcpu_block_refresh_hint(chunk, e_index); } else { e_block->right_free = min_t(int, e_block->right_free, PCPU_BITMAP_BLOCK_BITS - e_off); } } /* update in-between md_blocks */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->scan_hint = 0; block->contig_hint = 0; block->left_free = 0; block->right_free = 0; } } if (nr_empty_pages) pcpu_update_empty_pages(chunk, -nr_empty_pages); if (pcpu_region_overlap(chunk_md->scan_hint_start, chunk_md->scan_hint_start + chunk_md->scan_hint, bit_off, bit_off + bits)) chunk_md->scan_hint = 0; /* * The only time a full chunk scan is required is if the chunk * contig hint is broken. Otherwise, it means a smaller space * was used and therefore the chunk contig hint is still correct. */ if (pcpu_region_overlap(chunk_md->contig_hint_start, chunk_md->contig_hint_start + chunk_md->contig_hint, bit_off, bit_off + bits)) pcpu_chunk_refresh_hint(chunk, false); } /** * pcpu_block_update_hint_free - updates the block hints on the free path * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of request * * Updates metadata for the allocation path. This avoids a blind block * refresh by making use of the block contig hints. If this fails, it scans * forward and backward to determine the extent of the free area. This is * capped at the boundary of blocks. * * A chunk update is triggered if a page becomes free, a block becomes free, * or the free spans across blocks. This tradeoff is to minimize iterating * over the block metadata to update chunk_md->contig_hint. * chunk_md->contig_hint may be off by up to a page, but it will never be more * than the available space. If the contig hint is contained in one block, it * will be accurate. */ static void pcpu_block_update_hint_free(struct pcpu_chunk *chunk, int bit_off, int bits) { int nr_empty_pages = 0; struct pcpu_block_md *s_block, *e_block, *block; int s_index, e_index; /* block indexes of the freed allocation */ int s_off, e_off; /* block offsets of the freed allocation */ int start, end; /* start and end of the whole free area */ /* * Calculate per block offsets. * The calculation uses an inclusive range, but the resulting offsets * are [start, end). e_index always points to the last block in the * range. */ s_index = pcpu_off_to_block_index(bit_off); e_index = pcpu_off_to_block_index(bit_off + bits - 1); s_off = pcpu_off_to_block_off(bit_off); e_off = pcpu_off_to_block_off(bit_off + bits - 1) + 1; s_block = chunk->md_blocks + s_index; e_block = chunk->md_blocks + e_index; /* * Check if the freed area aligns with the block->contig_hint. * If it does, then the scan to find the beginning/end of the * larger free area can be avoided. * * start and end refer to beginning and end of the free area * within each their respective blocks. This is not necessarily * the entire free area as it may span blocks past the beginning * or end of the block. */ start = s_off; if (s_off == s_block->contig_hint + s_block->contig_hint_start) { start = s_block->contig_hint_start; } else { /* * Scan backwards to find the extent of the free area. * find_last_bit returns the starting bit, so if the start bit * is returned, that means there was no last bit and the * remainder of the chunk is free. */ int l_bit = find_last_bit(pcpu_index_alloc_map(chunk, s_index), start); start = (start == l_bit) ? 0 : l_bit + 1; } end = e_off; if (e_off == e_block->contig_hint_start) end = e_block->contig_hint_start + e_block->contig_hint; else end = find_next_bit(pcpu_index_alloc_map(chunk, e_index), PCPU_BITMAP_BLOCK_BITS, end); /* update s_block */ e_off = (s_index == e_index) ? end : PCPU_BITMAP_BLOCK_BITS; if (!start && e_off == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(s_block, start, e_off); /* freeing in the same block */ if (s_index != e_index) { /* update e_block */ if (end == PCPU_BITMAP_BLOCK_BITS) nr_empty_pages++; pcpu_block_update(e_block, 0, end); /* reset md_blocks in the middle */ nr_empty_pages += (e_index - s_index - 1); for (block = s_block + 1; block < e_block; block++) { block->first_free = 0; block->scan_hint = 0; block->contig_hint_start = 0; block->contig_hint = PCPU_BITMAP_BLOCK_BITS; block->left_free = PCPU_BITMAP_BLOCK_BITS; block->right_free = PCPU_BITMAP_BLOCK_BITS; } } if (nr_empty_pages) pcpu_update_empty_pages(chunk, nr_empty_pages); /* * Refresh chunk metadata when the free makes a block free or spans * across blocks. The contig_hint may be off by up to a page, but if * the contig_hint is contained in a block, it will be accurate with * the else condition below. */ if (((end - start) >= PCPU_BITMAP_BLOCK_BITS) || s_index != e_index) pcpu_chunk_refresh_hint(chunk, true); else pcpu_block_update(&chunk->chunk_md, pcpu_block_off_to_off(s_index, start), end); } /** * pcpu_is_populated - determines if the region is populated * @chunk: chunk of interest * @bit_off: chunk offset * @bits: size of area * @next_off: return value for the next offset to start searching * * For atomic allocations, check if the backing pages are populated. * * RETURNS: * Bool if the backing pages are populated. * next_index is to skip over unpopulated blocks in pcpu_find_block_fit. */ static bool pcpu_is_populated(struct pcpu_chunk *chunk, int bit_off, int bits, int *next_off) { unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(bit_off * PCPU_MIN_ALLOC_SIZE); page_end = PFN_UP((bit_off + bits) * PCPU_MIN_ALLOC_SIZE); rs = page_start; bitmap_next_clear_region(chunk->populated, &rs, &re, page_end); if (rs >= page_end) return true; *next_off = re * PAGE_SIZE / PCPU_MIN_ALLOC_SIZE; return false; } /** * pcpu_find_block_fit - finds the block index to start searching * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE bytes) * @pop_only: use populated regions only * * Given a chunk and an allocation spec, find the offset to begin searching * for a free region. This iterates over the bitmap metadata blocks to * find an offset that will be guaranteed to fit the requirements. It is * not quite first fit as if the allocation does not fit in the contig hint * of a block or chunk, it is skipped. This errs on the side of caution * to prevent excess iteration. Poor alignment can cause the allocator to * skip over blocks and chunks that have valid free areas. * * RETURNS: * The offset in the bitmap to begin searching. * -1 if no offset is found. */ static int pcpu_find_block_fit(struct pcpu_chunk *chunk, int alloc_bits, size_t align, bool pop_only) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, next_off; /* * Check to see if the allocation can fit in the chunk's contig hint. * This is an optimization to prevent scanning by assuming if it * cannot fit in the global hint, there is memory pressure and creating * a new chunk would happen soon. */ bit_off = ALIGN(chunk_md->contig_hint_start, align) - chunk_md->contig_hint_start; if (bit_off + alloc_bits > chunk_md->contig_hint) return -1; bit_off = pcpu_next_hint(chunk_md, alloc_bits); bits = 0; pcpu_for_each_fit_region(chunk, alloc_bits, align, bit_off, bits) { if (!pop_only || pcpu_is_populated(chunk, bit_off, bits, &next_off)) break; bit_off = next_off; bits = 0; } if (bit_off == pcpu_chunk_map_bits(chunk)) return -1; return bit_off; } /* * pcpu_find_zero_area - modified from bitmap_find_next_zero_area_off() * @map: the address to base the search on * @size: the bitmap size in bits * @start: the bitnumber to start searching at * @nr: the number of zeroed bits we're looking for * @align_mask: alignment mask for zero area * @largest_off: offset of the largest area skipped * @largest_bits: size of the largest area skipped * * The @align_mask should be one less than a power of 2. * * This is a modified version of bitmap_find_next_zero_area_off() to remember * the largest area that was skipped. This is imperfect, but in general is * good enough. The largest remembered region is the largest failed region * seen. This does not include anything we possibly skipped due to alignment. * pcpu_block_update_scan() does scan backwards to try and recover what was * lost to alignment. While this can cause scanning to miss earlier possible * free areas, smaller allocations will eventually fill those holes. */ static unsigned long pcpu_find_zero_area(unsigned long *map, unsigned long size, unsigned long start, unsigned long nr, unsigned long align_mask, unsigned long *largest_off, unsigned long *largest_bits) { unsigned long index, end, i, area_off, area_bits; again: index = find_next_zero_bit(map, size, start); /* Align allocation */ index = __ALIGN_MASK(index, align_mask); area_off = index; end = index + nr; if (end > size) return end; i = find_next_bit(map, end, index); if (i < end) { area_bits = i - area_off; /* remember largest unused area with best alignment */ if (area_bits > *largest_bits || (area_bits == *largest_bits && *largest_off && (!area_off || __ffs(area_off) > __ffs(*largest_off)))) { *largest_off = area_off; *largest_bits = area_bits; } start = i + 1; goto again; } return index; } /** * pcpu_alloc_area - allocates an area from a pcpu_chunk * @chunk: chunk of interest * @alloc_bits: size of request in allocation units * @align: alignment of area (max PAGE_SIZE) * @start: bit_off to start searching * * This function takes in a @start offset to begin searching to fit an * allocation of @alloc_bits with alignment @align. It needs to scan * the allocation map because if it fits within the block's contig hint, * @start will be block->first_free. This is an attempt to fill the * allocation prior to breaking the contig hint. The allocation and * boundary maps are updated accordingly if it confirms a valid * free area. * * RETURNS: * Allocated addr offset in @chunk on success. * -1 if no matching area is found. */ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits, size_t align, int start) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; size_t align_mask = (align) ? (align - 1) : 0; unsigned long area_off = 0, area_bits = 0; int bit_off, end, oslot; lockdep_assert_held(&pcpu_lock); oslot = pcpu_chunk_slot(chunk); /* * Search to find a fit. */ end = min_t(int, start + alloc_bits + PCPU_BITMAP_BLOCK_BITS, pcpu_chunk_map_bits(chunk)); bit_off = pcpu_find_zero_area(chunk->alloc_map, end, start, alloc_bits, align_mask, &area_off, &area_bits); if (bit_off >= end) return -1; if (area_bits) pcpu_block_update_scan(chunk, area_off, area_bits); /* update alloc map */ bitmap_set(chunk->alloc_map, bit_off, alloc_bits); /* update boundary map */ set_bit(bit_off, chunk->bound_map); bitmap_clear(chunk->bound_map, bit_off + 1, alloc_bits - 1); set_bit(bit_off + alloc_bits, chunk->bound_map); chunk->free_bytes -= alloc_bits * PCPU_MIN_ALLOC_SIZE; /* update first free bit */ if (bit_off == chunk_md->first_free) chunk_md->first_free = find_next_zero_bit( chunk->alloc_map, pcpu_chunk_map_bits(chunk), bit_off + alloc_bits); pcpu_block_update_hint_alloc(chunk, bit_off, alloc_bits); pcpu_chunk_relocate(chunk, oslot); return bit_off * PCPU_MIN_ALLOC_SIZE; } /** * pcpu_free_area - frees the corresponding offset * @chunk: chunk of interest * @off: addr offset into chunk * * This function determines the size of an allocation to free using * the boundary bitmap and clears the allocation map. * * RETURNS: * Number of freed bytes. */ static int pcpu_free_area(struct pcpu_chunk *chunk, int off) { struct pcpu_block_md *chunk_md = &chunk->chunk_md; int bit_off, bits, end, oslot, freed; lockdep_assert_held(&pcpu_lock); pcpu_stats_area_dealloc(chunk); oslot = pcpu_chunk_slot(chunk); bit_off = off / PCPU_MIN_ALLOC_SIZE; /* find end index */ end = find_next_bit(chunk->bound_map, pcpu_chunk_map_bits(chunk), bit_off + 1); bits = end - bit_off; bitmap_clear(chunk->alloc_map, bit_off, bits); freed = bits * PCPU_MIN_ALLOC_SIZE; /* update metadata */ chunk->free_bytes += freed; /* update first free bit */ chunk_md->first_free = min(chunk_md->first_free, bit_off); pcpu_block_update_hint_free(chunk, bit_off, bits); pcpu_chunk_relocate(chunk, oslot); return freed; } static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits) { block->scan_hint = 0; block->contig_hint = nr_bits; block->left_free = nr_bits; block->right_free = nr_bits; block->first_free = 0; block->nr_bits = nr_bits; } static void pcpu_init_md_blocks(struct pcpu_chunk *chunk) { struct pcpu_block_md *md_block; /* init the chunk's block */ pcpu_init_md_block(&chunk->chunk_md, pcpu_chunk_map_bits(chunk)); for (md_block = chunk->md_blocks; md_block != chunk->md_blocks + pcpu_chunk_nr_blocks(chunk); md_block++) pcpu_init_md_block(md_block, PCPU_BITMAP_BLOCK_BITS); } /** * pcpu_alloc_first_chunk - creates chunks that serve the first chunk * @tmp_addr: the start of the region served * @map_size: size of the region served * * This is responsible for creating the chunks that serve the first chunk. The * base_addr is page aligned down of @tmp_addr while the region end is page * aligned up. Offsets are kept track of to determine the region served. All * this is done to appease the bitmap allocator in avoiding partial blocks. * * RETURNS: * Chunk serving the region at @tmp_addr of @map_size. */ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr, int map_size) { struct pcpu_chunk *chunk; unsigned long aligned_addr, lcm_align; int start_offset, offset_bits, region_size, region_bits; size_t alloc_size; /* region calculations */ aligned_addr = tmp_addr & PAGE_MASK; start_offset = tmp_addr - aligned_addr; /* * Align the end of the region with the LCM of PAGE_SIZE and * PCPU_BITMAP_BLOCK_SIZE. One of these constants is a multiple of * the other. */ lcm_align = lcm(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE); region_size = ALIGN(start_offset + map_size, lcm_align); /* allocate chunk */ alloc_size = struct_size(chunk, populated, BITS_TO_LONGS(region_size >> PAGE_SHIFT)); chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); INIT_LIST_HEAD(&chunk->list); chunk->base_addr = (void *)aligned_addr; chunk->start_offset = start_offset; chunk->end_offset = region_size - chunk->start_offset - map_size; chunk->nr_pages = region_size >> PAGE_SHIFT; region_bits = pcpu_chunk_map_bits(chunk); alloc_size = BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]); chunk->alloc_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->alloc_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]); chunk->bound_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->bound_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]); chunk->md_blocks = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!chunk->md_blocks) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); #ifdef CONFIG_MEMCG_KMEM /* first chunk isn't memcg-aware */ chunk->obj_cgroups = NULL; #endif pcpu_init_md_blocks(chunk); /* manage populated page bitmap */ chunk->immutable = true; bitmap_fill(chunk->populated, chunk->nr_pages); chunk->nr_populated = chunk->nr_pages; chunk->nr_empty_pop_pages = chunk->nr_pages; chunk->free_bytes = map_size; if (chunk->start_offset) { /* hide the beginning of the bitmap */ offset_bits = chunk->start_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, 0, offset_bits); set_bit(0, chunk->bound_map); set_bit(offset_bits, chunk->bound_map); chunk->chunk_md.first_free = offset_bits; pcpu_block_update_hint_alloc(chunk, 0, offset_bits); } if (chunk->end_offset) { /* hide the end of the bitmap */ offset_bits = chunk->end_offset / PCPU_MIN_ALLOC_SIZE; bitmap_set(chunk->alloc_map, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); set_bit((start_offset + map_size) / PCPU_MIN_ALLOC_SIZE, chunk->bound_map); set_bit(region_bits, chunk->bound_map); pcpu_block_update_hint_alloc(chunk, pcpu_chunk_map_bits(chunk) - offset_bits, offset_bits); } return chunk; } static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp) { struct pcpu_chunk *chunk; int region_bits; chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp); if (!chunk) return NULL; INIT_LIST_HEAD(&chunk->list); chunk->nr_pages = pcpu_unit_pages; region_bits = pcpu_chunk_map_bits(chunk); chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) * sizeof(chunk->alloc_map[0]), gfp); if (!chunk->alloc_map) goto alloc_map_fail; chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) * sizeof(chunk->bound_map[0]), gfp); if (!chunk->bound_map) goto bound_map_fail; chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) * sizeof(chunk->md_blocks[0]), gfp); if (!chunk->md_blocks) goto md_blocks_fail; #ifdef CONFIG_MEMCG_KMEM if (pcpu_is_memcg_chunk(type)) { chunk->obj_cgroups = pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) * sizeof(struct obj_cgroup *), gfp); if (!chunk->obj_cgroups) goto objcg_fail; } #endif pcpu_init_md_blocks(chunk); /* init metadata */ chunk->free_bytes = chunk->nr_pages * PAGE_SIZE; return chunk; #ifdef CONFIG_MEMCG_KMEM objcg_fail: pcpu_mem_free(chunk->md_blocks); #endif md_blocks_fail: pcpu_mem_free(chunk->bound_map); bound_map_fail: pcpu_mem_free(chunk->alloc_map); alloc_map_fail: pcpu_mem_free(chunk); return NULL; } static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; #ifdef CONFIG_MEMCG_KMEM pcpu_mem_free(chunk->obj_cgroups); #endif pcpu_mem_free(chunk->md_blocks); pcpu_mem_free(chunk->bound_map); pcpu_mem_free(chunk->alloc_map); pcpu_mem_free(chunk); } /** * pcpu_chunk_populated - post-population bookkeeping * @chunk: pcpu_chunk which got populated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been populated to @chunk. Update * the bookkeeping information accordingly. Must be called after each * successful population. * * If this is @for_alloc, do not increment pcpu_nr_empty_pop_pages because it * is to serve an allocation in that area. */ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; pcpu_nr_populated += nr; pcpu_update_empty_pages(chunk, nr); } /** * pcpu_chunk_depopulated - post-depopulation bookkeeping * @chunk: pcpu_chunk which got depopulated * @page_start: the start page * @page_end: the end page * * Pages in [@page_start,@page_end) have been depopulated from @chunk. * Update the bookkeeping information accordingly. Must be called after * each successful depopulation. */ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, int page_start, int page_end) { int nr = page_end - page_start; lockdep_assert_held(&pcpu_lock); bitmap_clear(chunk->populated, page_start, nr); chunk->nr_populated -= nr; pcpu_nr_populated -= nr; pcpu_update_empty_pages(chunk, -nr); } /* * Chunk management implementation. * * To allow different implementations, chunk alloc/free and * [de]population are implemented in a separate file which is pulled * into this file and compiled together. The following functions * should be implemented. * * pcpu_populate_chunk - populate the specified range of a chunk * pcpu_depopulate_chunk - depopulate the specified range of a chunk * pcpu_create_chunk - create a new chunk * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop * pcpu_addr_to_page - translate address to physical address * pcpu_verify_alloc_info - check alloc_info is acceptable during init */ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end, gfp_t gfp); static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int page_start, int page_end); static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type, gfp_t gfp); static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); static struct page *pcpu_addr_to_page(void *addr); static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); #ifdef CONFIG_NEED_PER_CPU_KM #include "percpu-km.c" #else #include "percpu-vm.c" #endif /** * pcpu_chunk_addr_search - determine chunk containing specified address * @addr: address for which the chunk needs to be determined. * * This is an internal function that handles all but static allocations. * Static percpu address values should never be passed into the allocator. * * RETURNS: * The address of the found chunk. */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { /* is it in the dynamic region (first chunk)? */ if (pcpu_addr_in_chunk(pcpu_first_chunk, addr)) return pcpu_first_chunk; /* is it in the reserved region? */ if (pcpu_addr_in_chunk(pcpu_reserved_chunk, addr)) return pcpu_reserved_chunk; /* * The address is relative to unit0 which might be unused and * thus unmapped. Offset the address to the unit space of the * current processor before looking it up in the vmalloc * space. Note that any possible cpu id can be used here, so * there's no need to worry about preemption or cpu hotplug. */ addr += pcpu_unit_offsets[raw_smp_processor_id()]; return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } #ifdef CONFIG_MEMCG_KMEM static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { struct obj_cgroup *objcg; if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT)) return PCPU_CHUNK_ROOT; objcg = get_obj_cgroup_from_current(); if (!objcg) return PCPU_CHUNK_ROOT; if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { obj_cgroup_put(objcg); return PCPU_FAIL_ALLOC; } *objcgp = objcg; return PCPU_CHUNK_MEMCG; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { if (!objcg) return; if (chunk) { chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, size * num_possible_cpus()); rcu_read_unlock(); } else { obj_cgroup_uncharge(objcg, size * num_possible_cpus()); obj_cgroup_put(objcg); } } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { struct obj_cgroup *objcg; if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk))) return; objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; obj_cgroup_uncharge(objcg, size * num_possible_cpus()); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, -(size * num_possible_cpus())); rcu_read_unlock(); obj_cgroup_put(objcg); } #else /* CONFIG_MEMCG_KMEM */ static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp) { return PCPU_CHUNK_ROOT; } static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, struct pcpu_chunk *chunk, int off, size_t size) { } static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) { } #endif /* CONFIG_MEMCG_KMEM */ /** * pcpu_alloc - the percpu allocator * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @reserved: allocate from the reserved chunk if available * @gfp: allocation flags * * Allocate percpu area of @size bytes aligned at @align. If @gfp doesn't * contain %GFP_KERNEL, the allocation is atomic. If @gfp has __GFP_NOWARN * then no warning will be triggered on invalid or failed allocation * requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, gfp_t gfp) { gfp_t pcpu_gfp; bool is_atomic; bool do_warn; enum pcpu_chunk_type type; struct list_head *pcpu_slot; struct obj_cgroup *objcg = NULL; static int warn_limit = 10; struct pcpu_chunk *chunk, *next; const char *err; int slot, off, cpu, ret; unsigned long flags; void __percpu *ptr; size_t bits, bit_align; gfp = current_gfp_context(gfp); /* whitelisted flags that can be passed to the backing allocators */ pcpu_gfp = gfp & (GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL; do_warn = !(gfp & __GFP_NOWARN); /* * There is now a minimum allocation size of PCPU_MIN_ALLOC_SIZE, * therefore alignment must be a minimum of that many bytes. * An allocation may have internal fragmentation from rounding up * of up to PCPU_MIN_ALLOC_SIZE - 1 bytes. */ if (unlikely(align < PCPU_MIN_ALLOC_SIZE)) align = PCPU_MIN_ALLOC_SIZE; size = ALIGN(size, PCPU_MIN_ALLOC_SIZE); bits = size >> PCPU_MIN_ALLOC_SHIFT; bit_align = align >> PCPU_MIN_ALLOC_SHIFT; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE || !is_power_of_2(align))) { WARN(do_warn, "illegal size (%zu) or align (%zu) for percpu allocation\n", size, align); return NULL; } type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg); if (unlikely(type == PCPU_FAIL_ALLOC)) return NULL; pcpu_slot = pcpu_chunk_list(type); if (!is_atomic) { /* * pcpu_balance_workfn() allocates memory under this mutex, * and it may wait for memory reclaim. Allow current task * to become OOM victim, in case of memory pressure. */ if (gfp & __GFP_NOFAIL) { mutex_lock(&pcpu_alloc_mutex); } else if (mutex_lock_killable(&pcpu_alloc_mutex)) { pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } } spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { err = "alloc from reserved chunk failed"; goto fail_unlock; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; err = "alloc from reserved chunk failed"; goto fail_unlock; } restart: /* search through normal chunks */ for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) { list_for_each_entry_safe(chunk, next, &pcpu_slot[slot], list) { off = pcpu_find_block_fit(chunk, bits, bit_align, is_atomic); if (off < 0) { if (slot < PCPU_SLOT_FAIL_THRESHOLD) pcpu_chunk_move(chunk, 0); continue; } off = pcpu_alloc_area(chunk, bits, bit_align, off); if (off >= 0) goto area_found; } } spin_unlock_irqrestore(&pcpu_lock, flags); /* * No space left. Create a new chunk. We don't want multiple * tasks to create chunks simultaneously. Serialize and create iff * there's still no empty chunk after grabbing the mutex. */ if (is_atomic) { err = "atomic alloc failed, no space left"; goto fail; } if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(type, pcpu_gfp); if (!chunk) { err = "failed to allocate new chunk"; goto fail; } spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); } else { spin_lock_irqsave(&pcpu_lock, flags); } goto restart; area_found: pcpu_stats_area_alloc(chunk, size); spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ if (!is_atomic) { unsigned int page_start, page_end, rs, re; page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); bitmap_for_each_clear_region(chunk->populated, rs, re, page_start, page_end) { WARN_ON(chunk->immutable); ret = pcpu_populate_chunk(chunk, rs, re, pcpu_gfp); spin_lock_irqsave(&pcpu_lock, flags); if (ret) { pcpu_free_area(chunk, off); err = "failed to populate"; goto fail_unlock; } pcpu_chunk_populated(chunk, rs, re); spin_unlock_irqrestore(&pcpu_lock, flags); } mutex_unlock(&pcpu_alloc_mutex); } if (pcpu_nr_empty_pop_pages[type] < PCPU_EMPTY_POP_PAGES_LOW) pcpu_schedule_balance_work(); /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); kmemleak_alloc_percpu(ptr, size, gfp); trace_percpu_alloc_percpu(reserved, is_atomic, size, align, chunk->base_addr, off, ptr); pcpu_memcg_post_alloc_hook(objcg, chunk, off, size); return ptr; fail_unlock: spin_unlock_irqrestore(&pcpu_lock, flags); fail: trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align); if (!is_atomic && do_warn && warn_limit) { pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n", size, align, is_atomic, err); dump_stack(); if (!--warn_limit) pr_info("limit reached, disable warning\n"); } if (is_atomic) { /* see the flag handling in pcpu_blance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); } else { mutex_unlock(&pcpu_alloc_mutex); } pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size); return NULL; } /** * __alloc_percpu_gfp - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * @gfp: allocation flags * * Allocate zero-filled percpu area of @size bytes aligned at @align. If * @gfp doesn't contain %GFP_KERNEL, the allocation doesn't block and can * be called from any context but is a lot more likely to fail. If @gfp * has __GFP_NOWARN then no warning will be triggered on invalid or failed * allocation requests. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) { return pcpu_alloc(size, align, false, gfp); } EXPORT_SYMBOL_GPL(__alloc_percpu_gfp); /** * __alloc_percpu - allocate dynamic percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * * Equivalent to __alloc_percpu_gfp(size, align, %GFP_KERNEL). */ void __percpu *__alloc_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, false, GFP_KERNEL); } EXPORT_SYMBOL_GPL(__alloc_percpu); /** * __alloc_reserved_percpu - allocate reserved percpu area * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * * Allocate zero-filled percpu area of @size bytes aligned at @align * from reserved percpu area if arch has set it up; otherwise, * allocation is served from the same dynamic area. Might sleep. * Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. * * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, true, GFP_KERNEL); } /** * __pcpu_balance_workfn - manage the amount of free chunks and populated pages * @type: chunk type * * Reclaim all fully free chunks except for the first one. This is also * responsible for maintaining the pool of empty populated pages. However, * it is possible that this is called when physical memory is scarce causing * OOM killer to be triggered. We should avoid doing so until an actual * allocation causes the failure as it is possible that requests can be * serviced from already backed regions. */ static void __pcpu_balance_workfn(enum pcpu_chunk_type type) { /* gfp flags passed to underlying allocators */ const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; LIST_HEAD(to_free); struct list_head *pcpu_slot = pcpu_chunk_list(type); struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1]; struct pcpu_chunk *chunk, *next; int slot, nr_to_pop, ret; /* * There's no reason to keep around multiple unused chunks and VM * areas can be scarce. Destroy all free chunks except for one. */ mutex_lock(&pcpu_alloc_mutex); spin_lock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, free_head, list) { WARN_ON(chunk->immutable); /* spare the first one */ if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; list_move(&chunk->list, &to_free); } spin_unlock_irq(&pcpu_lock); list_for_each_entry_safe(chunk, next, &to_free, list) { unsigned int rs, re; bitmap_for_each_set_region(chunk->populated, rs, re, 0, chunk->nr_pages) { pcpu_depopulate_chunk(chunk, rs, re); spin_lock_irq(&pcpu_lock); pcpu_chunk_depopulated(chunk, rs, re); spin_unlock_irq(&pcpu_lock); } pcpu_destroy_chunk(chunk); cond_resched(); } /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic * allocs don't increase fragmentation. If atomic allocation * failed previously, always populate the maximum amount. This * should prevent atomic allocs larger than PAGE_SIZE from keeping * failing indefinitely; however, large atomic allocs are not * something we support properly and can be highly unreliable and * inefficient. */ retry_pop: if (pcpu_atomic_alloc_failed) { nr_to_pop = PCPU_EMPTY_POP_PAGES_HIGH; /* best effort anyway, don't worry about synchronization */ pcpu_atomic_alloc_failed = false; } else { nr_to_pop = clamp(PCPU_EMPTY_POP_PAGES_HIGH - pcpu_nr_empty_pop_pages[type], 0, PCPU_EMPTY_POP_PAGES_HIGH); } for (slot = pcpu_size_to_slot(PAGE_SIZE); slot < pcpu_nr_slots; slot++) { unsigned int nr_unpop = 0, rs, re; if (!nr_to_pop) break; spin_lock_irq(&pcpu_lock); list_for_each_entry(chunk, &pcpu_slot[slot], list) { nr_unpop = chunk->nr_pages - chunk->nr_populated; if (nr_unpop) break; } spin_unlock_irq(&pcpu_lock); if (!nr_unpop) continue; /* @chunk can't go away while pcpu_alloc_mutex is held */ bitmap_for_each_clear_region(chunk->populated, rs, re, 0, chunk->nr_pages) { int nr = min_t(int, re - rs, nr_to_pop); ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp); if (!ret) { nr_to_pop -= nr; spin_lock_irq(&pcpu_lock); pcpu_chunk_populated(chunk, rs, rs + nr); spin_unlock_irq(&pcpu_lock); } else { nr_to_pop = 0; } if (!nr_to_pop) break; } } if (nr_to_pop) { /* ran out of chunks to populate, create a new one and retry */ chunk = pcpu_create_chunk(type, gfp); if (chunk) { spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); spin_unlock_irq(&pcpu_lock); goto retry_pop; } } mutex_unlock(&pcpu_alloc_mutex); } /** * pcpu_balance_workfn - manage the amount of free chunks and populated pages * @work: unused * * Call __pcpu_balance_workfn() for each chunk type. */ static void pcpu_balance_workfn(struct work_struct *work) { enum pcpu_chunk_type type; for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) __pcpu_balance_workfn(type); } /** * free_percpu - free percpu area * @ptr: pointer to area to free * * Free percpu area @ptr. * * CONTEXT: * Can be called from atomic context. */ void free_percpu(void __percpu *ptr) { void *addr; struct pcpu_chunk *chunk; unsigned long flags; int size, off; bool need_balance = false; struct list_head *pcpu_slot; if (!ptr) return; kmemleak_free_percpu(ptr); addr = __pcpu_ptr_to_addr(ptr); spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); off = addr - chunk->base_addr; size = pcpu_free_area(chunk, off); pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk)); pcpu_memcg_free_hook(chunk, off, size); /* if there are more than one fully free chunks, wake up grim reaper */ if (chunk->free_bytes == pcpu_unit_size) { struct pcpu_chunk *pos; list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list) if (pos != chunk) { need_balance = true; break; } } trace_percpu_free_percpu(chunk->base_addr, off, ptr); spin_unlock_irqrestore(&pcpu_lock, flags); if (need_balance) pcpu_schedule_balance_work(); } EXPORT_SYMBOL_GPL(free_percpu); bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr) { #ifdef CONFIG_SMP const size_t static_size = __per_cpu_end - __per_cpu_start; void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); unsigned int cpu; for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); void *va = (void *)addr; if (va >= start && va < start + static_size) { if (can_addr) { *can_addr = (unsigned long) (va - start); *can_addr += (unsigned long) per_cpu_ptr(base, get_boot_cpu_id()); } return true; } } #endif /* on UP, can't distinguish from other static vars, always false */ return false; } /** * is_kernel_percpu_address - test whether address is from static percpu area * @addr: address to test * * Test whether @addr belongs to in-kernel static percpu area. Module * static percpu areas are not considered. For those, use * is_module_percpu_address(). * * RETURNS: * %true if @addr is from in-kernel static percpu area, %false otherwise. */ bool is_kernel_percpu_address(unsigned long addr) { return __is_kernel_percpu_address(addr, NULL); } /** * per_cpu_ptr_to_phys - convert translated percpu address to physical address * @addr: the address to be converted to physical address * * Given @addr which is dereferenceable address obtained via one of * percpu access macros, this function translates it into its physical * address. The caller is responsible for ensuring @addr stays valid * until this function finishes. * * percpu allocator has special setup for the first chunk, which currently * supports either embedding in linear address space or vmalloc mapping, * and, from the second one, the backing allocator (currently either vm or * km) provides translation. * * The addr can be translated simply without checking if it falls into the * first chunk. But the current code reflects better how percpu allocator * actually works, and the verification can discover both bugs in percpu * allocator itself and per_cpu_ptr_to_phys() callers. So we keep current * code. * * RETURNS: * The physical address for @addr. */ phys_addr_t per_cpu_ptr_to_phys(void *addr) { void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); bool in_first_chunk = false; unsigned long first_low, first_high; unsigned int cpu; /* * The following test on unit_low/high isn't strictly * necessary but will speed up lookups of addresses which * aren't in the first chunk. * * The address check is against full chunk sizes. pcpu_base_addr * points to the beginning of the first chunk including the * static region. Assumes good intent as the first chunk may * not be full (ie. < pcpu_unit_pages in size). */ first_low = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_low_unit_cpu, 0); first_high = (unsigned long)pcpu_base_addr + pcpu_unit_page_offset(pcpu_high_unit_cpu, pcpu_unit_pages); if ((unsigned long)addr >= first_low && (unsigned long)addr < first_high) { for_each_possible_cpu(cpu) { void *start = per_cpu_ptr(base, cpu); if (addr >= start && addr < start + pcpu_unit_size) { in_first_chunk = true; break; } } } if (in_first_chunk) { if (!is_vmalloc_addr(addr)) return __pa(addr); else return page_to_phys(vmalloc_to_page(addr)) + offset_in_page(addr); } else return page_to_phys(pcpu_addr_to_page(addr)) + offset_in_page(addr); } /** * pcpu_alloc_alloc_info - allocate percpu allocation info * @nr_groups: the number of groups * @nr_units: the number of units * * Allocate ai which is large enough for @nr_groups groups containing * @nr_units units. The returned ai's groups[0].cpu_map points to the * cpu_map array which is long enough for @nr_units and filled with * NR_CPUS. It's the caller's responsibility to initialize cpu_map * pointer of other groups. * * RETURNS: * Pointer to the allocated pcpu_alloc_info on success, NULL on * failure. */ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, int nr_units) { struct pcpu_alloc_info *ai; size_t base_size, ai_size; void *ptr; int unit; base_size = ALIGN(struct_size(ai, groups, nr_groups), __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); ptr = memblock_alloc(PFN_ALIGN(ai_size), PAGE_SIZE); if (!ptr) return NULL; ai = ptr; ptr += base_size; ai->groups[0].cpu_map = ptr; for (unit = 0; unit < nr_units; unit++) ai->groups[0].cpu_map[unit] = NR_CPUS; ai->nr_groups = nr_groups; ai->__ai_size = PFN_ALIGN(ai_size); return ai; } /** * pcpu_free_alloc_info - free percpu allocation info * @ai: pcpu_alloc_info to free * * Free @ai which was allocated by pcpu_alloc_alloc_info(). */ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) { memblock_free_early(__pa(ai), ai->__ai_size); } /** * pcpu_dump_alloc_info - print out information about pcpu_alloc_info * @lvl: loglevel * @ai: allocation info to dump * * Print out information about @ai using loglevel @lvl. */ static void pcpu_dump_alloc_info(const char *lvl, const struct pcpu_alloc_info *ai) { int group_width = 1, cpu_width = 1, width; char empty_str[] = "--------"; int alloc = 0, alloc_end = 0; int group, v; int upa, apl; /* units per alloc, allocs per line */ v = ai->nr_groups; while (v /= 10) group_width++; v = num_possible_cpus(); while (v /= 10) cpu_width++; empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; upa = ai->alloc_size / ai->unit_size; width = upa * (cpu_width + 1) + group_width + 3; apl = rounddown_pow_of_two(max(60 / width, 1)); printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", lvl, ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); for (group = 0; group < ai->nr_groups; group++) { const struct pcpu_group_info *gi = &ai->groups[group]; int unit = 0, unit_end = 0; BUG_ON(gi->nr_units % upa); for (alloc_end += gi->nr_units / upa; alloc < alloc_end; alloc++) { if (!(alloc % apl)) { pr_cont("\n"); printk("%spcpu-alloc: ", lvl); } pr_cont("[%0*d] ", group_width, group); for (unit_end += upa; unit < unit_end; unit++) if (gi->cpu_map[unit] != NR_CPUS) pr_cont("%0*d ", cpu_width, gi->cpu_map[unit]); else pr_cont("%s ", empty_str); } } pr_cont("\n"); } /** * pcpu_setup_first_chunk - initialize the first percpu chunk * @ai: pcpu_alloc_info describing how to percpu area is shaped * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * percpu area. This function is to be called from arch percpu area * setup path. * * @ai contains all information necessary to initialize the first * chunk and prime the dynamic percpu allocator. * * @ai->static_size is the size of static percpu area. * * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu * static areas on architectures where the addressing model has * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * * @ai->dyn_size determines the number of bytes available for dynamic * allocation in the first chunk. The area between @ai->static_size + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. * * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE * and equal to or larger than @ai->static_size + @ai->reserved_size + * @ai->dyn_size. * * @ai->atom_size is the allocation atom size and used as alignment * for vm areas. * * @ai->alloc_size is the allocation size and always multiple of * @ai->atom_size. This is larger than @ai->atom_size if * @ai->unit_size is larger than @ai->atom_size. * * @ai->nr_groups and @ai->groups describe virtual memory layout of * percpu areas. Units which should be colocated are put into the * same group. Dynamic VM areas will be allocated according to these * groupings. If @ai->nr_groups is zero, a single group containing * all units is assumed. * * The caller should have mapped the first chunk at @base_addr and * copied static data to each unit. * * The first chunk will always contain a static and a dynamic region. * However, the static region is not managed by any chunk. If the first * chunk also contains a reserved region, it is served by two chunks - * one for the reserved region and one for the dynamic region. They * share the same vm, but use offset regions in the area allocation map. * The chunk serving the dynamic region is circulated in the chunk slots * and available for dynamic allocation like any other chunk. */ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; struct pcpu_chunk *chunk; unsigned long *group_offsets; size_t *group_sizes; unsigned long *unit_off; unsigned int cpu; int *unit_map; int group, unit, i; int map_size; unsigned long tmp_addr; size_t alloc_size; enum pcpu_chunk_type type; #define PCPU_SETUP_BUG_ON(cond) do { \ if (unlikely(cond)) { \ pr_emerg("failed to initialize, %s\n", #cond); \ pr_emerg("cpu_possible_mask=%*pb\n", \ cpumask_pr_args(cpu_possible_mask)); \ pcpu_dump_alloc_info(KERN_EMERG, ai); \ BUG(); \ } \ } while (0) /* sanity checks */ PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); #ifdef CONFIG_SMP PCPU_SETUP_BUG_ON(!ai->static_size); PCPU_SETUP_BUG_ON(offset_in_page(__per_cpu_start)); #endif PCPU_SETUP_BUG_ON(!base_addr); PCPU_SETUP_BUG_ON(offset_in_page(base_addr)); PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); PCPU_SETUP_BUG_ON(offset_in_page(ai->unit_size)); PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->unit_size, PCPU_BITMAP_BLOCK_SIZE)); PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); PCPU_SETUP_BUG_ON(!ai->dyn_size); PCPU_SETUP_BUG_ON(!IS_ALIGNED(ai->reserved_size, PCPU_MIN_ALLOC_SIZE)); PCPU_SETUP_BUG_ON(!(IS_ALIGNED(PCPU_BITMAP_BLOCK_SIZE, PAGE_SIZE) || IS_ALIGNED(PAGE_SIZE, PCPU_BITMAP_BLOCK_SIZE))); PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); /* process group information and build config tables accordingly */ alloc_size = ai->nr_groups * sizeof(group_offsets[0]); group_offsets = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!group_offsets) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = ai->nr_groups * sizeof(group_sizes[0]); group_sizes = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!group_sizes) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = nr_cpu_ids * sizeof(unit_map[0]); unit_map = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!unit_map) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); alloc_size = nr_cpu_ids * sizeof(unit_off[0]); unit_off = memblock_alloc(alloc_size, SMP_CACHE_BYTES); if (!unit_off) panic("%s: Failed to allocate %zu bytes\n", __func__, alloc_size); for (cpu = 0; cpu < nr_cpu_ids; cpu++) unit_map[cpu] = UINT_MAX; pcpu_low_unit_cpu = NR_CPUS; pcpu_high_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { const struct pcpu_group_info *gi = &ai->groups[group]; group_offsets[group] = gi->base_offset; group_sizes[group] = gi->nr_units * ai->unit_size; for (i = 0; i < gi->nr_units; i++) { cpu = gi->cpu_map[i]; if (cpu == NR_CPUS) continue; PCPU_SETUP_BUG_ON(cpu >= nr_cpu_ids); PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; /* determine low/high unit_cpu */ if (pcpu_low_unit_cpu == NR_CPUS || unit_off[cpu] < unit_off[pcpu_low_unit_cpu]) pcpu_low_unit_cpu = cpu; if (pcpu_high_unit_cpu == NR_CPUS || unit_off[cpu] > unit_off[pcpu_high_unit_cpu]) pcpu_high_unit_cpu = cpu; } } pcpu_nr_units = unit; for_each_possible_cpu(cpu) PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); /* we're done parsing the input, undefine BUG macro and dump config */ #undef PCPU_SETUP_BUG_ON pcpu_dump_alloc_info(KERN_DEBUG, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; pcpu_group_sizes = group_sizes; pcpu_unit_map = unit_map; pcpu_unit_offsets = unit_off; /* determine basic parameters */ pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; pcpu_atom_size = ai->atom_size; pcpu_chunk_struct_size = struct_size(chunk, populated, BITS_TO_LONGS(pcpu_unit_pages)); pcpu_stats_save_ai(ai); /* * Allocate chunk slots. The additional last slot is for * empty chunks. */ pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2; pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) * PCPU_NR_CHUNK_TYPES, SMP_CACHE_BYTES); if (!pcpu_chunk_lists) panic("%s: Failed to allocate %zu bytes\n", __func__, pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) * PCPU_NR_CHUNK_TYPES); for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) for (i = 0; i < pcpu_nr_slots; i++) INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]); /* * The end of the static region needs to be aligned with the * minimum allocation size as this offsets the reserved and * dynamic region. The first chunk ends page aligned by * expanding the dynamic region, therefore the dynamic region * can be shrunk to compensate while still staying above the * configured sizes. */ static_size = ALIGN(ai->static_size, PCPU_MIN_ALLOC_SIZE); dyn_size = ai->dyn_size - (static_size - ai->static_size); /* * Initialize first chunk. * If the reserved_size is non-zero, this initializes the reserved * chunk. If the reserved_size is zero, the reserved chunk is NULL * and the dynamic region is initialized here. The first chunk, * pcpu_first_chunk, will always point to the chunk that serves * the dynamic region. */ tmp_addr = (unsigned long)base_addr + static_size; map_size = ai->reserved_size ?: dyn_size; chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); /* init dynamic chunk if necessary */ if (ai->reserved_size) { pcpu_reserved_chunk = chunk; tmp_addr = (unsigned long)base_addr + static_size + ai->reserved_size; map_size = dyn_size; chunk = pcpu_alloc_first_chunk(tmp_addr, map_size); } /* link the first chunk in */ pcpu_first_chunk = chunk; pcpu_nr_empty_pop_pages[PCPU_CHUNK_ROOT] = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); /* include all regions of the first chunk */ pcpu_nr_populated += PFN_DOWN(size_sum); pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); /* we're done */ pcpu_base_addr = base_addr; } #ifdef CONFIG_SMP const char * const pcpu_fc_names[PCPU_FC_NR] __initconst = { [PCPU_FC_AUTO] = "auto", [PCPU_FC_EMBED] = "embed", [PCPU_FC_PAGE] = "page", }; enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; static int __init percpu_alloc_setup(char *str) { if (!str) return -EINVAL; if (0) /* nada */; #ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK else if (!strcmp(str, "embed")) pcpu_chosen_fc = PCPU_FC_EMBED; #endif #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK else if (!strcmp(str, "page")) pcpu_chosen_fc = PCPU_FC_PAGE; #endif else pr_warn("unknown allocator %s specified\n", str); return 0; } early_param("percpu_alloc", percpu_alloc_setup); /* * pcpu_embed_first_chunk() is used by the generic percpu setup. * Build it if needed by the arch config or the generic setup is going * to be used. */ #if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) #define BUILD_EMBED_FIRST_CHUNK #endif /* build pcpu_page_first_chunk() iff needed by the arch config */ #if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) #define BUILD_PAGE_FIRST_CHUNK #endif /* pcpu_build_alloc_info() is used by both embed and page first chunk */ #if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) /** * pcpu_build_alloc_info - build alloc_info considering distances between CPUs * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * * This function determines grouping of units, their mappings to cpus * and other parameters considering needed percpu size, allocation * atom size and distances between CPUs. * * Groups are always multiples of atom size and CPUs which are of * LOCAL_DISTANCE both ways are grouped together and share space for * units in the same group. The returned configuration is guaranteed * to have CPUs on different nodes on different groups and >=75% usage * of allocated virtual address space. * * RETURNS: * On success, pointer to the new allocation_info is returned. On * failure, ERR_PTR value is returned. */ static struct pcpu_alloc_info * __init pcpu_build_alloc_info( size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn) { static int group_map[NR_CPUS] __initdata; static int group_cnt[NR_CPUS] __initdata; const size_t static_size = __per_cpu_end - __per_cpu_start; int nr_groups = 1, nr_units = 0; size_t size_sum, min_unit_size, alloc_size; int upa, max_upa, best_upa; /* units_per_alloc */ int last_allocs, group, unit; unsigned int cpu, tcpu; struct pcpu_alloc_info *ai; unsigned int *cpu_map; /* this function may be called multiple times */ memset(group_map, 0, sizeof(group_map)); memset(group_cnt, 0, sizeof(group_cnt)); /* calculate size_sum and ensure dyn_size is enough for early alloc */ size_sum = PFN_ALIGN(static_size + reserved_size + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); dyn_size = size_sum - static_size - reserved_size; /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest * which can accommodate 4k aligned segments which are equal to * or larger than min_unit_size. */ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); /* determine the maximum # of units that can fit in an allocation */ alloc_size = roundup(min_unit_size, atom_size); upa = alloc_size / min_unit_size; while (alloc_size % upa || (offset_in_page(alloc_size / upa))) upa--; max_upa = upa; /* group cpus according to their proximity */ for_each_possible_cpu(cpu) { group = 0; next_group: for_each_possible_cpu(tcpu) { if (cpu == tcpu) break; if (group_map[tcpu] == group && cpu_distance_fn && (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { group++; nr_groups = max(nr_groups, group + 1); goto next_group; } } group_map[cpu] = group; group_cnt[group]++; } /* * Wasted space is caused by a ratio imbalance of upa to group_cnt. * Expand the unit_size until we use >= 75% of the units allocated. * Related to atom_size, which could be much larger than the unit_size. */ last_allocs = INT_MAX; for (upa = max_upa; upa; upa--) { int allocs = 0, wasted = 0; if (alloc_size % upa || (offset_in_page(alloc_size / upa))) continue; for (group = 0; group < nr_groups; group++) { int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); allocs += this_allocs; wasted += this_allocs * upa - group_cnt[group]; } /* * Don't accept if wastage is over 1/3. The * greater-than comparison ensures upa==1 always * passes the following check. */ if (wasted > num_possible_cpus() / 3) continue; /* and then don't consume more memory */ if (allocs > last_allocs) break; last_allocs = allocs; best_upa = upa; } upa = best_upa; /* allocate and fill alloc_info */ for (group = 0; group < nr_groups; group++) nr_units += roundup(group_cnt[group], upa); ai = pcpu_alloc_alloc_info(nr_groups, nr_units); if (!ai) return ERR_PTR(-ENOMEM); cpu_map = ai->groups[0].cpu_map; for (group = 0; group < nr_groups; group++) { ai->groups[group].cpu_map = cpu_map; cpu_map += roundup(group_cnt[group], upa); } ai->static_size = static_size; ai->reserved_size = reserved_size; ai->dyn_size = dyn_size; ai->unit_size = alloc_size / upa; ai->atom_size = atom_size; ai->alloc_size = alloc_size; for (group = 0, unit = 0; group < nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; /* * Initialize base_offset as if all groups are located * back-to-back. The caller should update this to * reflect actual allocation. */ gi->base_offset = unit * ai->unit_size; for_each_possible_cpu(cpu) if (group_map[cpu] == group) gi->cpu_map[gi->nr_units++] = cpu; gi->nr_units = roundup(gi->nr_units, upa); unit += gi->nr_units; } BUG_ON(unit != nr_units); return ai; } #endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ #if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem * @reserved_size: the size of reserved percpu area in bytes * @dyn_size: minimum free size for dynamic allocation in bytes * @atom_size: allocation atom size * @cpu_distance_fn: callback to determine distance between cpus, optional * @alloc_fn: function to allocate percpu page * @free_fn: function to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated * by calling @alloc_fn and used as-is without being mapped into * vmalloc area. Allocations are always whole multiples of @atom_size * aligned to @atom_size. * * This enables the first chunk to piggy back on the linear physical * mapping which often uses larger page size. Please note that this * can result in very sparse cpu->unit mapping on NUMA machines thus * requiring large vmalloc address space. Don't use this allocator if * vmalloc space is not orders of magnitude larger than distances * between node memory addresses (ie. 32bit NUMA machines). * * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit * size, the leftover is returned using @free_fn. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, size_t atom_size, pcpu_fc_cpu_distance_fn_t cpu_distance_fn, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn) { void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; int group, i, highest_group, rc = 0; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); if (IS_ERR(ai)) return PTR_ERR(ai); size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); areas = memblock_alloc(areas_size, SMP_CACHE_BYTES); if (!areas) { rc = -ENOMEM; goto out_free; } /* allocate, copy and determine base address & max_distance */ highest_group = 0; for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; unsigned int cpu = NR_CPUS; void *ptr; for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) cpu = gi->cpu_map[i]; BUG_ON(cpu == NR_CPUS); /* allocate space for the whole group */ ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); if (!ptr) { rc = -ENOMEM; goto out_free_areas; } /* kmemleak tracks the percpu allocations separately */ kmemleak_free(ptr); areas[group] = ptr; base = min(ptr, base); if (ptr > areas[highest_group]) highest_group = group; } max_distance = areas[highest_group] - base; max_distance += ai->unit_size * ai->groups[highest_group].nr_units; /* warn if maximum distance is further than 75% of vmalloc space */ if (max_distance > VMALLOC_TOTAL * 3 / 4) { pr_warn("max_distance=0x%lx too large for vmalloc space 0x%lx\n", max_distance, VMALLOC_TOTAL); #ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK /* and fail if we have fallback */ rc = -EINVAL; goto out_free_areas; #endif } /* * Copy data and free unused parts. This should happen after all * allocations are complete; otherwise, we may end up with * overlapping groups. */ for (group = 0; group < ai->nr_groups; group++) { struct pcpu_group_info *gi = &ai->groups[group]; void *ptr = areas[group]; for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { if (gi->cpu_map[i] == NR_CPUS) { /* unused unit, free whole */ free_fn(ptr, ai->unit_size); continue; } /* copy and return the unused part */ memcpy(ptr, __per_cpu_load, ai->static_size); free_fn(ptr + size_sum, ai->unit_size - size_sum); } } /* base address is now known, determine group base offsets */ for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; } pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); pcpu_setup_first_chunk(ai, base); goto out_free; out_free_areas: for (group = 0; group < ai->nr_groups; group++) if (areas[group]) free_fn(areas[group], ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) memblock_free_early(__pa(areas), areas_size); return rc; } #endif /* BUILD_EMBED_FIRST_CHUNK */ #ifdef BUILD_PAGE_FIRST_CHUNK /** * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages * @reserved_size: the size of reserved percpu area in bytes * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE * @free_fn: function to free percpu page, always called with PAGE_SIZE * @populate_pte_fn: function to populate pte * * This is a helper to ease setting up page-remapped first percpu * chunk and can be called where pcpu_setup_first_chunk() is expected. * * This is the basic allocator. Static percpu area is allocated * page-by-page into vmalloc area. * * RETURNS: * 0 on success, -errno on failure. */ int __init pcpu_page_first_chunk(size_t reserved_size, pcpu_fc_alloc_fn_t alloc_fn, pcpu_fc_free_fn_t free_fn, pcpu_fc_populate_pte_fn_t populate_pte_fn) { static struct vm_struct vm; struct pcpu_alloc_info *ai; char psize_str[16]; int unit_pages; size_t pages_size; struct page **pages; int unit, i, j, rc = 0; int upa; int nr_g0_units; snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); if (IS_ERR(ai)) return PTR_ERR(ai); BUG_ON(ai->nr_groups != 1); upa = ai->alloc_size/ai->unit_size; nr_g0_units = roundup(num_possible_cpus(), upa); if (WARN_ON(ai->groups[0].nr_units != nr_g0_units)) { pcpu_free_alloc_info(ai); return -EINVAL; } unit_pages = ai->unit_size >> PAGE_SHIFT; /* unaligned allocations can't be freed, round up to page size */ pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * sizeof(pages[0])); pages = memblock_alloc(pages_size, SMP_CACHE_BYTES); if (!pages) panic("%s: Failed to allocate %zu bytes\n", __func__, pages_size); /* allocate pages */ j = 0; for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned int cpu = ai->groups[0].cpu_map[unit]; for (i = 0; i < unit_pages; i++) { void *ptr; ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); if (!ptr) { pr_warn("failed to allocate %s page for cpu%u\n", psize_str, cpu); goto enomem; } /* kmemleak tracks the percpu allocations separately */ kmemleak_free(ptr); pages[j++] = virt_to_page(ptr); } } /* allocate vm area, map the pages and copy static data */ vm.flags = VM_ALLOC; vm.size = num_possible_cpus() * ai->unit_size; vm_area_register_early(&vm, PAGE_SIZE); for (unit = 0; unit < num_possible_cpus(); unit++) { unsigned long unit_addr = (unsigned long)vm.addr + unit * ai->unit_size; for (i = 0; i < unit_pages; i++) populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); /* pte already populated, the following shouldn't fail */ rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], unit_pages); if (rc < 0) panic("failed to map percpu area, err=%d\n", rc); /* * FIXME: Archs with virtual cache should flush local * cache for the linear mapping here - something * equivalent to flush_cache_vmap() on the local cpu. * flush_cache_vmap() can't be used as most supporting * data structures are not set up yet. */ /* copy static data */ memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: while (--j >= 0) free_fn(page_address(pages[j]), PAGE_SIZE); rc = -ENOMEM; out_free_ar: memblock_free_early(__pa(pages), pages_size); pcpu_free_alloc_info(ai); return rc; } #endif /* BUILD_PAGE_FIRST_CHUNK */ #ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA /* * Generic SMP percpu area setup. * * The embedding helper is used because its behavior closely resembles * the original non-dynamic generic percpu area setup. This is * important because many archs have addressing restrictions and might * fail if the percpu area is located far away from the previous * location. As an added bonus, in non-NUMA cases, embedding is * generally a good idea TLB-wise because percpu area can piggy back * on the physical linear memory mapping which uses large page * mappings on applicable archs. */ unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; EXPORT_SYMBOL(__per_cpu_offset); static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, size_t align) { return memblock_alloc_from(size, align, __pa(MAX_DMA_ADDRESS)); } static void __init pcpu_dfl_fc_free(void *ptr, size_t size) { memblock_free_early(__pa(ptr), size); } void __init setup_per_cpu_areas(void) { unsigned long delta; unsigned int cpu; int rc; /* * Always reserve area for module percpu variables. That's * what the legacy allocator did. */ rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); if (rc < 0) panic("Failed to initialize percpu areas."); delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; for_each_possible_cpu(cpu) __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; } #endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ #else /* CONFIG_SMP */ /* * UP percpu area setup. * * UP always uses km-based percpu allocator with identity mapping. * Static percpu variables are indistinguishable from the usual static * variables and don't require any special preparation. */ void __init setup_per_cpu_areas(void) { const size_t unit_size = roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, PERCPU_DYNAMIC_RESERVE)); struct pcpu_alloc_info *ai; void *fc; ai = pcpu_alloc_alloc_info(1, 1); fc = memblock_alloc_from(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); if (!ai || !fc) panic("Failed to allocate memory for percpu areas."); /* kmemleak tracks the percpu allocations separately */ kmemleak_free(fc); ai->dyn_size = unit_size; ai->unit_size = unit_size; ai->atom_size = unit_size; ai->alloc_size = unit_size; ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; pcpu_setup_first_chunk(ai, fc); pcpu_free_alloc_info(ai); } #endif /* CONFIG_SMP */ /* * pcpu_nr_pages - calculate total number of populated backing pages * * This reflects the number of pages populated to back chunks. Metadata is * excluded in the number exposed in meminfo as the number of backing pages * scales with the number of cpus and can quickly outweigh the memory used for * metadata. It also keeps this calculation nice and simple. * * RETURNS: * Total number of populated backing pages in use by the allocator. */ unsigned long pcpu_nr_pages(void) { return pcpu_nr_populated * pcpu_nr_units; } /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up * and running. */ static int __init percpu_enable_async(void) { pcpu_async_enabled = true; return 0; } subsys_initcall(percpu_enable_async);
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/block_dev.c * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE */ #include <linux/init.h> #include <linux/mm.h> #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/major.h> #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> #include <linux/buffer_head.h> #include <linux/swap.h> #include <linux/pagevec.h> #include <linux/writeback.h> #include <linux/mpage.h> #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/uio.h> #include <linux/namei.h> #include <linux/log2.h> #include <linux/cleancache.h> #include <linux/task_io_accounting_ops.h> #include <linux/falloc.h> #include <linux/uaccess.h> #include <linux/suspend.h> #include "internal.h" struct bdev_inode { struct block_device bdev; struct inode vfs_inode; }; static const struct address_space_operations def_blk_aops; static inline struct bdev_inode *BDEV_I(struct inode *inode) { return container_of(inode, struct bdev_inode, vfs_inode); } struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } EXPORT_SYMBOL(I_BDEV); static void bdev_write_inode(struct block_device *bdev) { struct inode *inode = bdev->bd_inode; int ret; spin_lock(&inode->i_lock); while (inode->i_state & I_DIRTY) { spin_unlock(&inode->i_lock); ret = write_inode_now(inode, true); if (ret) { char name[BDEVNAME_SIZE]; pr_warn_ratelimited("VFS: Dirty inode writeback failed " "for block device %s (err=%d).\n", bdevname(bdev, name), ret); } spin_lock(&inode->i_lock); } spin_unlock(&inode->i_lock); } /* Kill _all_ buffers and pagecache , dirty or not.. */ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); truncate_inode_pages(mapping, 0); } /* Invalidate clean unused buffers and pagecache. */ void invalidate_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; if (mapping->nrpages) { invalidate_bh_lrus(); lru_add_drain_all(); /* make sure all lru add caches are flushed */ invalidate_mapping_pages(mapping, 0, -1); } /* 99% of the time, we don't need to flush the cleancache on the bdev. * But, for the strange corners, lets be cautious */ cleancache_invalidate_inode(mapping); } EXPORT_SYMBOL(invalidate_bdev); /* * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). */ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, loff_t lstart, loff_t lend) { struct block_device *claimed_bdev = NULL; int err; /* * If we don't hold exclusive handle for the device, upgrade to it * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ if (!(mode & FMODE_EXCL)) { claimed_bdev = bdev->bd_contains; err = bd_prepare_to_claim(bdev, claimed_bdev, truncate_bdev_range); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); if (claimed_bdev) bd_abort_claiming(bdev, claimed_bdev, truncate_bdev_range); return 0; invalidate: /* * Someone else has handle exclusively open. Try invalidating instead. * The 'end' argument is inclusive so the rounding is safe. */ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, lstart >> PAGE_SHIFT, lend >> PAGE_SHIFT); } EXPORT_SYMBOL(truncate_bdev_range); static void set_init_blocksize(struct block_device *bdev) { unsigned int bsize = bdev_logical_block_size(bdev); loff_t size = i_size_read(bdev->bd_inode); while (bsize < PAGE_SIZE) { if (size & bsize) break; bsize <<= 1; } bdev->bd_inode->i_blkbits = blksize_bits(bsize); } int set_blocksize(struct block_device *bdev, int size) { /* Size must be a power of two, and between 512 and PAGE_SIZE */ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ if (size < bdev_logical_block_size(bdev)) return -EINVAL; /* Don't change the size if it is same as current */ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { sync_blockdev(bdev); bdev->bd_inode->i_blkbits = blksize_bits(size); kill_bdev(bdev); } return 0; } EXPORT_SYMBOL(set_blocksize); int sb_set_blocksize(struct super_block *sb, int size) { if (set_blocksize(sb->s_bdev, size)) return 0; /* If we get here, we know size is power of two * and it's value is between 512 and PAGE_SIZE */ sb->s_blocksize = size; sb->s_blocksize_bits = blksize_bits(size); return sb->s_blocksize; } EXPORT_SYMBOL(sb_set_blocksize); int sb_min_blocksize(struct super_block *sb, int size) { int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); } EXPORT_SYMBOL(sb_min_blocksize); static int blkdev_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh, int create) { bh->b_bdev = I_BDEV(inode); bh->b_blocknr = iblock; set_buffer_mapped(bh); return 0; } static struct inode *bdev_file_inode(struct file *file) { return file->f_mapping->host; } static unsigned int dio_bio_write_op(struct kiocb *iocb) { unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; /* avoid the need for a I/O completion work item */ if (iocb->ki_flags & IOCB_DSYNC) op |= REQ_FUA; return op; } #define DIO_INLINE_BIO_VECS 4 static void blkdev_bio_end_io_simple(struct bio *bio) { struct task_struct *waiter = bio->bi_private; WRITE_ONCE(bio->bi_private, NULL); blk_wake_io_task(waiter); } static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) { struct file *file = iocb->ki_filp; struct block_device *bdev = I_BDEV(bdev_file_inode(file)); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; struct bio bio; ssize_t ret; blk_qc_t qc; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; if (nr_pages <= DIO_INLINE_BIO_VECS) vecs = inline_vecs; else { vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), GFP_KERNEL); if (!vecs) return -ENOMEM; } bio_init(&bio, vecs, nr_pages); bio_set_dev(&bio, bdev); bio.bi_iter.bi_sector = pos >> 9; bio.bi_write_hint = iocb->ki_hint; bio.bi_private = current; bio.bi_end_io = blkdev_bio_end_io_simple; bio.bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(&bio, iter); if (unlikely(ret)) goto out; ret = bio.bi_iter.bi_size; if (iov_iter_rw(iter) == READ) { bio.bi_opf = REQ_OP_READ; if (iter_is_iovec(iter)) should_dirty = true; } else { bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } if (iocb->ki_flags & IOCB_NOWAIT) bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); qc = submit_bio(&bio); for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(bio.bi_private)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); bio_release_pages(&bio, should_dirty); if (unlikely(bio.bi_status)) ret = blk_status_to_errno(bio.bi_status); out: if (vecs != inline_vecs) kfree(vecs); bio_uninit(&bio); return ret; } struct blkdev_dio { union { struct kiocb *iocb; struct task_struct *waiter; }; size_t size; atomic_t ref; bool multi_bio : 1; bool should_dirty : 1; bool is_sync : 1; struct bio bio; }; static struct bio_set blkdev_dio_pool; static int blkdev_iopoll(struct kiocb *kiocb, bool wait) { struct block_device *bdev = I_BDEV(kiocb->ki_filp->f_mapping->host); struct request_queue *q = bdev_get_queue(bdev); return blk_poll(q, READ_ONCE(kiocb->ki_cookie), wait); } static void blkdev_bio_end_io(struct bio *bio) { struct blkdev_dio *dio = bio->bi_private; bool should_dirty = dio->should_dirty; if (bio->bi_status && !dio->bio.bi_status) dio->bio.bi_status = bio->bi_status; if (!dio->multi_bio || atomic_dec_and_test(&dio->ref)) { if (!dio->is_sync) { struct kiocb *iocb = dio->iocb; ssize_t ret; if (likely(!dio->bio.bi_status)) { ret = dio->size; iocb->ki_pos += ret; } else { ret = blk_status_to_errno(dio->bio.bi_status); } dio->iocb->ki_complete(iocb, ret, 0); if (dio->multi_bio) bio_put(&dio->bio); } else { struct task_struct *waiter = dio->waiter; WRITE_ONCE(dio->waiter, NULL); blk_wake_io_task(waiter); } } if (should_dirty) { bio_check_pages_dirty(bio); } else { bio_release_pages(bio, false); bio_put(bio); } } static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int nr_pages) { struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); struct block_device *bdev = I_BDEV(inode); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; bool is_poll = (iocb->ki_flags & IOCB_HIPRI) != 0; bool is_read = (iov_iter_rw(iter) == READ), is_sync; loff_t pos = iocb->ki_pos; blk_qc_t qc = BLK_QC_T_NONE; int ret = 0; if ((pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; bio = bio_alloc_bioset(GFP_KERNEL, nr_pages, &blkdev_dio_pool); dio = container_of(bio, struct blkdev_dio, bio); dio->is_sync = is_sync = is_sync_kiocb(iocb); if (dio->is_sync) { dio->waiter = current; bio_get(bio); } else { dio->iocb = iocb; } dio->size = 0; dio->multi_bio = false; dio->should_dirty = is_read && iter_is_iovec(iter); /* * Don't plug for HIPRI/polled IO, as those should go straight * to issue */ if (!is_poll) blk_start_plug(&plug); for (;;) { bio_set_dev(bio, bdev); bio->bi_iter.bi_sector = pos >> 9; bio->bi_write_hint = iocb->ki_hint; bio->bi_private = dio; bio->bi_end_io = blkdev_bio_end_io; bio->bi_ioprio = iocb->ki_ioprio; ret = bio_iov_iter_get_pages(bio, iter); if (unlikely(ret)) { bio->bi_status = BLK_STS_IOERR; bio_endio(bio); break; } if (is_read) { bio->bi_opf = REQ_OP_READ; if (dio->should_dirty) bio_set_pages_dirty(bio); } else { bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } if (iocb->ki_flags & IOCB_NOWAIT) bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos += bio->bi_iter.bi_size; nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); if (!nr_pages) { bool polled = false; if (iocb->ki_flags & IOCB_HIPRI) { bio_set_polled(bio, iocb); polled = true; } qc = submit_bio(bio); if (polled) WRITE_ONCE(iocb->ki_cookie, qc); break; } if (!dio->multi_bio) { /* * AIO needs an extra reference to ensure the dio * structure which is embedded into the first bio * stays around. */ if (!is_sync) bio_get(bio); dio->multi_bio = true; atomic_set(&dio->ref, 2); } else { atomic_inc(&dio->ref); } submit_bio(bio); bio = bio_alloc(GFP_KERNEL, nr_pages); } if (!is_poll) blk_finish_plug(&plug); if (!is_sync) return -EIOCBQUEUED; for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); if (!READ_ONCE(dio->waiter)) break; if (!(iocb->ki_flags & IOCB_HIPRI) || !blk_poll(bdev_get_queue(bdev), qc, true)) blk_io_schedule(); } __set_current_state(TASK_RUNNING); if (!ret) ret = blk_status_to_errno(dio->bio.bi_status); if (likely(!ret)) ret = dio->size; bio_put(&dio->bio); return ret; } static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { int nr_pages; nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); if (!nr_pages) return 0; if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) return __blkdev_direct_IO_simple(iocb, iter, nr_pages); return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); } static __init int blkdev_init(void) { return bioset_init(&blkdev_dio_pool, 4, offsetof(struct blkdev_dio, bio), BIOSET_NEED_BVECS); } module_init(blkdev_init); int __sync_blockdev(struct block_device *bdev, int wait) { if (!bdev) return 0; if (!wait) return filemap_flush(bdev->bd_inode->i_mapping); return filemap_write_and_wait(bdev->bd_inode->i_mapping); } /* * Write out and wait upon all the dirty data associated with a block * device via its mapping. Does not take the superblock lock. */ int sync_blockdev(struct block_device *bdev) { return __sync_blockdev(bdev, 1); } EXPORT_SYMBOL(sync_blockdev); /* * Write out and wait upon all dirty data associated with this * device. Filesystem data as well as the underlying block * device. Takes the superblock lock. */ int fsync_bdev(struct block_device *bdev) { struct super_block *sb = get_super(bdev); if (sb) { int res = sync_filesystem(sb); drop_super(sb); return res; } return sync_blockdev(bdev); } EXPORT_SYMBOL(fsync_bdev); /** * freeze_bdev -- lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple * freeze requests arrive simultaneously. It counts up in freeze_bdev() and * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze * actually. */ struct super_block *freeze_bdev(struct block_device *bdev) { struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); if (++bdev->bd_fsfreeze_count > 1) { /* * We don't even need to grab a reference - the first call * to freeze_bdev grab an active reference and only the last * thaw_bdev drops it. */ sb = get_super(bdev); if (sb) drop_super(sb); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } sb = get_active_super(bdev); if (!sb) goto out; if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb); else error = freeze_super(sb); if (error) { deactivate_super(sb); bdev->bd_fsfreeze_count--; mutex_unlock(&bdev->bd_fsfreeze_mutex); return ERR_PTR(error); } deactivate_super(sb); out: sync_blockdev(bdev); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; /* thaw_bdev releases s->s_umount */ } EXPORT_SYMBOL(freeze_bdev); /** * thaw_bdev -- unlock filesystem * @bdev: blockdevice to unlock * @sb: associated superblock * * Unlocks the filesystem and marks it writeable again after freeze_bdev(). */ int thaw_bdev(struct block_device *bdev, struct super_block *sb) { int error = -EINVAL; mutex_lock(&bdev->bd_fsfreeze_mutex); if (!bdev->bd_fsfreeze_count) goto out; error = 0; if (--bdev->bd_fsfreeze_count > 0) goto out; if (!sb) goto out; if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb); else error = thaw_super(sb); if (error) bdev->bd_fsfreeze_count++; out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } EXPORT_SYMBOL(thaw_bdev); static int blkdev_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, blkdev_get_block, wbc); } static int blkdev_readpage(struct file * file, struct page * page) { return block_read_full_page(page, blkdev_get_block); } static void blkdev_readahead(struct readahead_control *rac) { mpage_readahead(rac, blkdev_get_block); } static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { return block_write_begin(mapping, pos, len, flags, pagep, blkdev_get_block); } static int blkdev_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { int ret; ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); unlock_page(page); put_page(page); return ret; } /* * private llseek: * for a block special file file_inode(file)->i_size is zero * so we compute the size by hand (just as in block_read/write above) */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { struct inode *bd_inode = bdev_file_inode(file); loff_t retval; inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); inode_unlock(bd_inode); return retval; } int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; error = file_write_and_wait_range(filp, start, end); if (error) return error; /* * There is no need to serialise calls to blkdev_issue_flush with * i_mutex and doing so causes performance issues with concurrent * O_SYNC writers to a block device. */ error = blkdev_issue_flush(bdev, GFP_KERNEL); if (error == -EOPNOTSUPP) error = 0; return error; } EXPORT_SYMBOL(blkdev_fsync); /** * bdev_read_page() - Start reading a page from a block device * @bdev: The device to read the page from * @sector: The offset on the device to read the page to (need not be aligned) * @page: The page to read * * On entry, the page should be locked. It will be unlocked when the page * has been read. If the block driver implements rw_page synchronously, * that will be true on exit from this function, but it need not be. * * Errors returned by this function are usually "soft", eg out of memory, or * queue full; callers should try a different route to read this page rather * than propagate an error back up the stack. * * Return: negative errno if an error occurs, 0 if submission was successful. */ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; int result = -EOPNOTSUPP; if (!ops->rw_page || bdev_get_integrity(bdev)) return result; result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_READ); blk_queue_exit(bdev->bd_disk->queue); return result; } /** * bdev_write_page() - Start writing a page to a block device * @bdev: The device to write the page to * @sector: The offset on the device to write the page to (need not be aligned) * @page: The page to write * @wbc: The writeback_control for the write * * On entry, the page should be locked and not currently under writeback. * On exit, if the write started successfully, the page will be unlocked and * under writeback. If the write failed already (eg the driver failed to * queue the page to the device), the page will still be locked. If the * caller is a ->writepage implementation, it will need to unlock the page. * * Errors returned by this function are usually "soft", eg out of memory, or * queue full; callers should try a different route to write this page rather * than propagate an error back up the stack. * * Return: negative errno if an error occurs, 0 if submission was successful. */ int bdev_write_page(struct block_device *bdev, sector_t sector, struct page *page, struct writeback_control *wbc) { int result; const struct block_device_operations *ops = bdev->bd_disk->fops; if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; result = blk_queue_enter(bdev->bd_disk->queue, 0); if (result) return result; set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, REQ_OP_WRITE); if (result) { end_page_writeback(page); } else { clean_page_buffers(page); unlock_page(page); } blk_queue_exit(bdev->bd_disk->queue); return result; } /* * pseudo-fs */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) { struct bdev_inode *ei = kmem_cache_alloc(bdev_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; } static void bdev_free_inode(struct inode *inode) { kmem_cache_free(bdev_cachep, BDEV_I(inode)); } static void init_once(void *foo) { struct bdev_inode *ei = (struct bdev_inode *) foo; struct block_device *bdev = &ei->bdev; memset(bdev, 0, sizeof(*bdev)); mutex_init(&bdev->bd_mutex); #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); } static void bdev_evict_inode(struct inode *inode) { struct block_device *bdev = &BDEV_I(inode)->bdev; truncate_inode_pages_final(&inode->i_data); invalidate_inode_buffers(inode); /* is it needed here? */ clear_inode(inode); /* Detach inode from wb early as bdi_put() may free bdi->wb */ inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; } } static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, .drop_inode = generic_delete_inode, .evict_inode = bdev_evict_inode, }; static int bd_init_fs_context(struct fs_context *fc) { struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); if (!ctx) return -ENOMEM; fc->s_iflags |= SB_I_CGROUPWB; ctx->ops = &bdev_sops; return 0; } static struct file_system_type bd_type = { .name = "bdev", .init_fs_context = bd_init_fs_context, .kill_sb = kill_anon_super, }; struct super_block *blockdev_superblock __read_mostly; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; static struct vfsmount *bd_mnt; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) panic("Cannot register bdev pseudo-fs"); bd_mnt = kern_mount(&bd_type); if (IS_ERR(bd_mnt)) panic("Cannot create bdev pseudo-fs"); blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ } /* * Most likely _very_ bad one - but then it's hardly critical for small * /dev and can be fixed when somebody will need really large one. * Keep in mind that it will be fed through icache hash function too. */ static inline unsigned long hash(dev_t dev) { return MAJOR(dev)+MINOR(dev); } static int bdev_test(struct inode *inode, void *data) { return BDEV_I(inode)->bdev.bd_dev == *(dev_t *)data; } static int bdev_set(struct inode *inode, void *data) { BDEV_I(inode)->bdev.bd_dev = *(dev_t *)data; return 0; } static struct block_device *bdget(dev_t dev) { struct block_device *bdev; struct inode *inode; inode = iget5_locked(blockdev_superblock, hash(dev), bdev_test, bdev_set, &dev); if (!inode) return NULL; bdev = &BDEV_I(inode)->bdev; if (inode->i_state & I_NEW) { spin_lock_init(&bdev->bd_size_lock); bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; bdev->bd_part_count = 0; inode->i_mode = S_IFBLK; inode->i_rdev = dev; inode->i_bdev = bdev; inode->i_data.a_ops = &def_blk_aops; mapping_set_gfp_mask(&inode->i_data, GFP_USER); unlock_new_inode(inode); } return bdev; } /** * bdgrab -- Grab a reference to an already referenced block device * @bdev: Block device to grab a reference to. */ struct block_device *bdgrab(struct block_device *bdev) { ihold(bdev->bd_inode); return bdev; } EXPORT_SYMBOL(bdgrab); struct block_device *bdget_part(struct hd_struct *part) { return bdget(part_devt(part)); } long nr_blockdev_pages(void) { struct inode *inode; long ret = 0; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) ret += inode->i_mapping->nrpages; spin_unlock(&blockdev_superblock->s_inode_list_lock); return ret; } void bdput(struct block_device *bdev) { iput(bdev->bd_inode); } EXPORT_SYMBOL(bdput); static struct block_device *bd_acquire(struct inode *inode) { struct block_device *bdev; spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev && !inode_unhashed(bdev->bd_inode)) { bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } spin_unlock(&bdev_lock); /* * i_bdev references block device inode that was already shut down * (corresponding device got removed). Remove the reference and look * up block device inode again just in case new device got * reestablished under the same device number. */ if (bdev) bd_forget(inode); bdev = bdget(inode->i_rdev); if (bdev) { spin_lock(&bdev_lock); if (!inode->i_bdev) { /* * We take an additional reference to bd_inode, * and it's released in clear_inode() of inode. * So, we can access it via ->i_mapping always * without igrab(). */ bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; } spin_unlock(&bdev_lock); } return bdev; } /* Call when you free inode */ void bd_forget(struct inode *inode) { struct block_device *bdev = NULL; spin_lock(&bdev_lock); if (!sb_is_blkdev_sb(inode->i_sb)) bdev = inode->i_bdev; inode->i_bdev = NULL; inode->i_mapping = &inode->i_data; spin_unlock(&bdev_lock); if (bdev) bdput(bdev); } /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest * @whole: whole block device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * * Test whether @bdev can be claimed by @holder. * * CONTEXT: * spin_lock(&bdev_lock). * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, void *holder) { if (bdev->bd_holder == holder) return true; /* already a holder */ else if (bdev->bd_holder != NULL) return false; /* held by someone else */ else if (whole == bdev) return true; /* is a whole device which isn't held */ else if (whole->bd_holder == bd_may_claim) return true; /* is a partition of a device that is being partitioned */ else if (whole->bd_holder != NULL) return false; /* is a partition of a held device */ else return true; /* is a partition of an un-held device */ } /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @whole: the whole device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev * * Claim @bdev. This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller * has ownership of bd_claiming and bd_holder[s]. * * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ int bd_prepare_to_claim(struct block_device *bdev, struct block_device *whole, void *holder) { retry: spin_lock(&bdev_lock); /* if someone else claimed, fail */ if (!bd_may_claim(bdev, whole, holder)) { spin_unlock(&bdev_lock); return -EBUSY; } /* if claiming is already in progress, wait for it to finish */ if (whole->bd_claiming) { wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; } /* yay, all mine */ whole->bd_claiming = holder; spin_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ static struct gendisk *bdev_get_gendisk(struct block_device *bdev, int *partno) { struct gendisk *disk = get_gendisk(bdev->bd_dev, partno); if (!disk) return NULL; /* * Now that we hold gendisk reference we make sure bdev we looked up is * not stale. If it is, it means device got removed and created before * we looked up gendisk and we fail open in such case. Associating * unhashed bdev with newly created gendisk could lead to two bdevs * (and thus two independent caches) being associated with one device * which is bad. */ if (inode_unhashed(bdev->bd_inode)) { put_disk_and_module(disk); return NULL; } return disk; } static void bd_clear_claiming(struct block_device *whole, void *holder) { lockdep_assert_held(&bdev_lock); /* tell others that we're done */ BUG_ON(whole->bd_claiming != holder); whole->bd_claiming = NULL; wake_up_bit(&whole->bd_claiming, 0); } /** * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @whole: whole block device * @holder: holder that has claimed @bdev * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ static void bd_finish_claiming(struct block_device *bdev, struct block_device *whole, void *holder) { spin_lock(&bdev_lock); BUG_ON(!bd_may_claim(bdev, whole, holder)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder */ whole->bd_holders++; whole->bd_holder = bd_may_claim; bdev->bd_holders++; bdev->bd_holder = holder; bd_clear_claiming(whole, holder); spin_unlock(&bdev_lock); } /** * bd_abort_claiming - abort claiming of a block device * @bdev: block device of interest * @whole: whole block device * @holder: holder that has claimed @bdev * * Abort claiming of a block device when the exclusive open failed. This can be * also used when exclusive open is not actually desired and we just needed * to block other exclusive openers for a while. */ void bd_abort_claiming(struct block_device *bdev, struct block_device *whole, void *holder) { spin_lock(&bdev_lock); bd_clear_claiming(whole, holder); spin_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); #ifdef CONFIG_SYSFS struct bd_holder_disk { struct list_head list; struct gendisk *disk; int refcnt; }; static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; list_for_each_entry(holder, &bdev->bd_holder_disks, list) if (holder->disk == disk) return holder; return NULL; } static int add_symlink(struct kobject *from, struct kobject *to) { return sysfs_create_link(from, to, kobject_name(to)); } static void del_symlink(struct kobject *from, struct kobject *to) { sysfs_remove_link(from, kobject_name(to)); } /** * bd_link_disk_holder - create symlinks between holding disk and slave bdev * @bdev: the claimed slave bdev * @disk: the holding disk * * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. * * This functions creates the following sysfs symlinks. * * - from "slaves" directory of the holder @disk to the claimed @bdev * - from "holders" directory of the @bdev to the holder @disk * * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is * passed to bd_link_disk_holder(), then: * * /sys/block/dm-0/slaves/sda --> /sys/block/sda * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 * * The caller must have claimed @bdev before calling this function and * ensure that both @bdev and @disk are valid during the creation and * lifetime of these symlinks. * * CONTEXT: * Might sleep. * * RETURNS: * 0 on success, -errno on failure. */ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; int ret = 0; mutex_lock(&bdev->bd_mutex); WARN_ON_ONCE(!bdev->bd_holder); /* FIXME: remove the following once add_disk() handles errors */ if (WARN_ON(!disk->slave_dir || !bdev->bd_part->holder_dir)) goto out_unlock; holder = bd_find_holder_disk(bdev, disk); if (holder) { holder->refcnt++; goto out_unlock; } holder = kzalloc(sizeof(*holder), GFP_KERNEL); if (!holder) { ret = -ENOMEM; goto out_unlock; } INIT_LIST_HEAD(&holder->list); holder->disk = disk; holder->refcnt = 1; ret = add_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); if (ret) goto out_free; ret = add_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); if (ret) goto out_del; /* * bdev could be deleted beneath us which would implicitly destroy * the holder directory. Hold on to it. */ kobject_get(bdev->bd_part->holder_dir); list_add(&holder->list, &bdev->bd_holder_disks); goto out_unlock; out_del: del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); out_free: kfree(holder); out_unlock: mutex_unlock(&bdev->bd_mutex); return ret; } EXPORT_SYMBOL_GPL(bd_link_disk_holder); /** * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() * @bdev: the calimed slave bdev * @disk: the holding disk * * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. * * CONTEXT: * Might sleep. */ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) { struct bd_holder_disk *holder; mutex_lock(&bdev->bd_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, &part_to_dev(bdev->bd_part)->kobj); del_symlink(bdev->bd_part->holder_dir, &disk_to_dev(disk)->kobj); kobject_put(bdev->bd_part->holder_dir); list_del_init(&holder->list); kfree(holder); } mutex_unlock(&bdev->bd_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); #endif /** * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. */ static void check_disk_size_change(struct gendisk *disk, struct block_device *bdev, bool verbose) { loff_t disk_size, bdev_size; spin_lock(&bdev->bd_size_lock); disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { if (verbose) { printk(KERN_INFO "%s: detected capacity change from %lld to %lld\n", disk->disk_name, bdev_size, disk_size); } i_size_write(bdev->bd_inode, disk_size); } spin_unlock(&bdev->bd_size_lock); if (bdev_size > disk_size) { if (__invalidate_device(bdev, false)) pr_warn("VFS: busy inodes on resized disk %s\n", disk->disk_name); } } /** * revalidate_disk_size - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @verbose: if %true log a message about a size change if there is any * * This routine checks to see if the bdev size does not match the disk size * and adjusts it if it differs. When shrinking the bdev size, its all caches * are freed. */ void revalidate_disk_size(struct gendisk *disk, bool verbose) { struct block_device *bdev; /* * Hidden disks don't have associated bdev so there's no point in * revalidating them. */ if (disk->flags & GENHD_FL_HIDDEN) return; bdev = bdget_disk(disk, 0); if (bdev) { check_disk_size_change(disk, bdev, verbose); bdput(bdev); } } EXPORT_SYMBOL(revalidate_disk_size); void bd_set_nr_sectors(struct block_device *bdev, sector_t sectors) { spin_lock(&bdev->bd_size_lock); i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); spin_unlock(&bdev->bd_size_lock); } EXPORT_SYMBOL(bd_set_nr_sectors); static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) { struct gendisk *disk = bdev->bd_disk; int ret; lockdep_assert_held(&bdev->bd_mutex); if (!(disk->flags & GENHD_FL_UP)) return -ENXIO; rescan: ret = blk_drop_partitions(bdev); if (ret) return ret; clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that * support partitions (independ of actually having partitions created). * Doing that is rather inconsistent, but changing it broke legacy * udisks polling for legacy ide-cdrom devices. Use the crude check * below to get the sane behavior for most device while not breaking * userspace for this particular setup. */ if (invalidate) { if (disk_part_scan_enabled(disk) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); } else { if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); } check_disk_size_change(disk, bdev, !invalidate); if (get_capacity(disk)) { ret = blk_add_partitions(disk, bdev); if (ret == -EAGAIN) goto rescan; } else if (invalidate) { /* * Tell userspace that the media / partition table may have * changed. */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); } return ret; } /* * Only exported for for loop and dasd for historic reasons. Don't use in new * code! */ EXPORT_SYMBOL_GPL(bdev_disk_changed); /* * bd_mutex locking: * * mutex_lock(part->bd_mutex) * mutex_lock_nested(whole->bd_mutex, 1) */ static int __blkdev_get(struct block_device *bdev, fmode_t mode, void *holder, int for_part) { struct block_device *whole = NULL, *claiming = NULL; struct gendisk *disk; int ret; int partno; bool first_open = false, unblock_events = true, need_restart; restart: need_restart = false; ret = -ENXIO; disk = bdev_get_gendisk(bdev, &partno); if (!disk) goto out; if (partno) { whole = bdget_disk(disk, 0); if (!whole) { ret = -ENOMEM; goto out_put_disk; } } if (!for_part && (mode & FMODE_EXCL)) { WARN_ON_ONCE(!holder); if (whole) claiming = whole; else claiming = bdev; ret = bd_prepare_to_claim(bdev, claiming, holder); if (ret) goto out_put_whole; } disk_block_events(disk); mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { first_open = true; bdev->bd_disk = disk; bdev->bd_contains = bdev; bdev->bd_partno = partno; if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); if (!bdev->bd_part) goto out_clear; ret = 0; if (disk->fops->open) { ret = disk->fops->open(bdev, mode); /* * If we lost a race with 'disk' being deleted, * try again. See md.c */ if (ret == -ERESTARTSYS) need_restart = true; } if (!ret) { bd_set_nr_sectors(bdev, get_capacity(disk)); set_init_blocksize(bdev); } /* * If the device is invalidated, rescan partition * if open succeeded or failed with -ENOMEDIUM. * The latter is necessary to prevent ghost * partitions on a removed medium. */ if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) goto out_clear; } else { BUG_ON(for_part); ret = __blkdev_get(whole, mode, NULL, 1); if (ret) goto out_clear; bdev->bd_contains = bdgrab(whole); bdev->bd_part = disk_get_part(disk, partno); if (!(disk->flags & GENHD_FL_UP) || !bdev->bd_part || !bdev->bd_part->nr_sects) { ret = -ENXIO; goto out_clear; } bd_set_nr_sectors(bdev, bdev->bd_part->nr_sects); set_init_blocksize(bdev); } if (bdev->bd_bdi == &noop_backing_dev_info) bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (bdev->bd_contains == bdev) { ret = 0; if (bdev->bd_disk->fops->open) ret = bdev->bd_disk->fops->open(bdev, mode); /* the same as first opener case, read comment there */ if (test_bit(GD_NEED_PART_SCAN, &disk->state) && (!ret || ret == -ENOMEDIUM)) bdev_disk_changed(bdev, ret == -ENOMEDIUM); if (ret) goto out_unlock_bdev; } } bdev->bd_openers++; if (for_part) bdev->bd_part_count++; if (claiming) bd_finish_claiming(bdev, claiming, holder); /* * Block event polling for write claims if requested. Any write holder * makes the write_holder state stick until all are released. This is * good enough and tracking individual writeable reference is too * fragile given the way @mode is used in blkdev_get/put(). */ if (claiming && (mode & FMODE_WRITE) && !bdev->bd_write_holder && (disk->flags & GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; unblock_events = false; } mutex_unlock(&bdev->bd_mutex); if (unblock_events) disk_unblock_events(disk); /* only one opener holds refs to the module and disk */ if (!first_open) put_disk_and_module(disk); if (whole) bdput(whole); return 0; out_clear: disk_put_part(bdev->bd_part); bdev->bd_disk = NULL; bdev->bd_part = NULL; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; out_unlock_bdev: if (claiming) bd_abort_claiming(bdev, claiming, holder); mutex_unlock(&bdev->bd_mutex); disk_unblock_events(disk); out_put_whole: if (whole) bdput(whole); out_put_disk: put_disk_and_module(disk); if (need_restart) goto restart; out: return ret; } /** * blkdev_get - open a block device * @bdev: block_device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * * Open @bdev with @mode. If @mode includes %FMODE_EXCL, @bdev is * open with exclusive access. Specifying %FMODE_EXCL with %NULL * @holder is invalid. Exclusive opens may nest for the same @holder. * * On success, the reference count of @bdev is unchanged. On failure, * @bdev is put. * * CONTEXT: * Might sleep. * * RETURNS: * 0 on success, -errno on failure. */ static int blkdev_get(struct block_device *bdev, fmode_t mode, void *holder) { int ret, perm = 0; if (mode & FMODE_READ) perm |= MAY_READ; if (mode & FMODE_WRITE) perm |= MAY_WRITE; ret = devcgroup_inode_permission(bdev->bd_inode, perm); if (ret) goto bdput; ret =__blkdev_get(bdev, mode, holder, 0); if (ret) goto bdput; return 0; bdput: bdput(bdev); return ret; } /** * blkdev_get_by_path - open a block device by name * @path: path to the block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * * Open the blockdevice described by the device file at @path. @mode * and @holder are identical to blkdev_get(). * * On success, the returned block_device has reference count of one. * * CONTEXT: * Might sleep. * * RETURNS: * Pointer to block_device on success, ERR_PTR(-errno) on failure. */ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, void *holder) { struct block_device *bdev; int err; bdev = lookup_bdev(path); if (IS_ERR(bdev)) return bdev; err = blkdev_get(bdev, mode, holder); if (err) return ERR_PTR(err); if ((mode & FMODE_WRITE) && bdev_read_only(bdev)) { blkdev_put(bdev, mode); return ERR_PTR(-EACCES); } return bdev; } EXPORT_SYMBOL(blkdev_get_by_path); /** * blkdev_get_by_dev - open a block device by device number * @dev: device number of block device to open * @mode: FMODE_* mask * @holder: exclusive holder identifier * * Open the blockdevice described by device number @dev. @mode and * @holder are identical to blkdev_get(). * * Use it ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a * device number. _Never_ to be used for internal purposes. If you * ever need it - reconsider your API. * * On success, the returned block_device has reference count of one. * * CONTEXT: * Might sleep. * * RETURNS: * Pointer to block_device on success, ERR_PTR(-errno) on failure. */ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) { struct block_device *bdev; int err; bdev = bdget(dev); if (!bdev) return ERR_PTR(-ENOMEM); err = blkdev_get(bdev, mode, holder); if (err) return ERR_PTR(err); return bdev; } EXPORT_SYMBOL(blkdev_get_by_dev); static int blkdev_open(struct inode * inode, struct file * filp) { struct block_device *bdev; /* * Preserve backwards compatibility and allow large file access * even if userspace doesn't ask for it explicitly. Some mkfs * binary needs it. We might want to drop this workaround * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; if (filp->f_flags & O_NDELAY) filp->f_mode |= FMODE_NDELAY; if (filp->f_flags & O_EXCL) filp->f_mode |= FMODE_EXCL; if ((filp->f_flags & O_ACCMODE) == 3) filp->f_mode |= FMODE_WRITE_IOCTL; bdev = bd_acquire(inode); if (bdev == NULL) return -ENOMEM; filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return blkdev_get(bdev, filp->f_mode, filp); } static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) { struct gendisk *disk = bdev->bd_disk; struct block_device *victim = NULL; /* * Sync early if it looks like we're the last one. If someone else * opens the block device between now and the decrement of bd_openers * then we did a sync that we didn't need to, but that's not the end * of the world and we want to avoid long (could be several minute) * syncs while holding the mutex. */ if (bdev->bd_openers == 1) sync_blockdev(bdev); mutex_lock_nested(&bdev->bd_mutex, for_part); if (for_part) bdev->bd_part_count--; if (!--bdev->bd_openers) { WARN_ON_ONCE(bdev->bd_holders); sync_blockdev(bdev); kill_bdev(bdev); bdev_write_inode(bdev); } if (bdev->bd_contains == bdev) { if (disk->fops->release) disk->fops->release(disk, mode); } if (!bdev->bd_openers) { disk_put_part(bdev->bd_part); bdev->bd_part = NULL; bdev->bd_disk = NULL; if (bdev != bdev->bd_contains) victim = bdev->bd_contains; bdev->bd_contains = NULL; put_disk_and_module(disk); } mutex_unlock(&bdev->bd_mutex); bdput(bdev); if (victim) __blkdev_put(victim, mode, 1); } void blkdev_put(struct block_device *bdev, fmode_t mode) { mutex_lock(&bdev->bd_mutex); if (mode & FMODE_EXCL) { bool bdev_free; /* * Release a claim on the device. The holder fields * are protected with bdev_lock. bd_mutex is to * synchronize disk_holder unlinking. */ spin_lock(&bdev_lock); WARN_ON_ONCE(--bdev->bd_holders < 0); WARN_ON_ONCE(--bdev->bd_contains->bd_holders < 0); /* bd_contains might point to self, check in a separate step */ if ((bdev_free = !bdev->bd_holders)) bdev->bd_holder = NULL; if (!bdev->bd_contains->bd_holders) bdev->bd_contains->bd_holder = NULL; spin_unlock(&bdev_lock); /* * If this was the last claim, remove holder link and * unblock evpoll if it was a write holder. */ if (bdev_free && bdev->bd_write_holder) { disk_unblock_events(bdev->bd_disk); bdev->bd_write_holder = false; } } /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE * event. This is to ensure detection of media removal commanded * from userland - e.g. eject(1). */ disk_flush_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE); mutex_unlock(&bdev->bd_mutex); __blkdev_put(bdev, mode, 0); } EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have * to updated it before every ioctl. */ if (file->f_flags & O_NDELAY) mode |= FMODE_NDELAY; else mode &= ~FMODE_NDELAY; return blkdev_ioctl(bdev, mode, cmd, arg); } /* * Write data to the block device. Only intended for the block device itself * and the raw driver which basically is a fake block device. * * Does not take i_mutex for the write and thus is not for general purpose * use. */ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; size_t shorted = 0; ssize_t ret; if (bdev_read_only(I_BDEV(bd_inode))) return -EPERM; if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) return -ETXTBSY; if (!iov_iter_count(from)) return 0; if (iocb->ki_pos >= size) return -ENOSPC; if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; size -= iocb->ki_pos; if (iov_iter_count(from) > size) { shorted = iov_iter_count(from) - size; iov_iter_truncate(from, size); } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); blk_finish_plug(&plug); return ret; } EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; size_t shorted = 0; ssize_t ret; if (pos >= size) return 0; size -= pos; if (iov_iter_count(to) > size) { shorted = iov_iter_count(to) - size; iov_iter_truncate(to, size); } ret = generic_file_read_iter(iocb, to); iov_iter_reexpand(to, iov_iter_count(to) + shorted); return ret; } EXPORT_SYMBOL_GPL(blkdev_read_iter); /* * Try to release a page associated with block device when the system * is under memory pressure. */ static int blkdev_releasepage(struct page *page, gfp_t wait) { struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; if (super && super->s_op->bdev_try_to_free_page) return super->s_op->bdev_try_to_free_page(super, page, wait); return try_to_free_buffers(page); } static int blkdev_writepages(struct address_space *mapping, struct writeback_control *wbc) { return generic_writepages(mapping, wbc); } static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .readahead = blkdev_readahead, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = blkdev_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .migratepage = buffer_migrate_page_norefs, .is_dirty_writeback = buffer_check_dirty_writeback, }; #define BLKDEV_FALLOC_FL_SUPPORTED \ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) static long blkdev_fallocate(struct file *file, int mode, loff_t start, loff_t len) { struct block_device *bdev = I_BDEV(bdev_file_inode(file)); loff_t end = start + len - 1; loff_t isize; int error; /* Fail if we don't recognize the flags. */ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) return -EOPNOTSUPP; /* Don't go off the end of the device. */ isize = i_size_read(bdev->bd_inode); if (start >= isize) return -EINVAL; if (end >= isize) { if (mode & FALLOC_FL_KEEP_SIZE) { len = isize - start; end = start + len - 1; } else return -EINVAL; } /* * Don't allow IO that isn't aligned to logical block size. */ if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; /* Invalidate the page cache, including dirty pages. */ error = truncate_bdev_range(bdev, file->f_mode, start, end); if (error) return error; switch (mode) { case FALLOC_FL_ZERO_RANGE: case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOUNMAP); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: error = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, BLKDEV_ZERO_NOFALLBACK); break; case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: error = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL, 0); break; default: return -EOPNOTSUPP; } if (error) return error; /* * Invalidate again; if someone wandered in and dirtied a page, * the caller will be given -EBUSY. The third argument is * inclusive, so the rounding here is safe. */ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); } const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, .iopoll = blkdev_iopoll, .mmap = generic_file_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, }; /** * lookup_bdev - lookup a struct block_device by name * @pathname: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) * otherwise. */ struct block_device *lookup_bdev(const char *pathname) { struct block_device *bdev; struct inode *inode; struct path path; int error; if (!pathname || !*pathname) return ERR_PTR(-EINVAL); error = kern_path(pathname, LOOKUP_FOLLOW, &path); if (error) return ERR_PTR(error); inode = d_backing_inode(path.dentry); error = -ENOTBLK; if (!S_ISBLK(inode->i_mode)) goto fail; error = -EACCES; if (!may_open_dev(&path)) goto fail; error = -ENOMEM; bdev = bd_acquire(inode); if (!bdev) goto fail; out: path_put(&path); return bdev; fail: bdev = ERR_PTR(error); goto out; } EXPORT_SYMBOL(lookup_bdev); int __invalidate_device(struct block_device *bdev, bool kill_dirty) { struct super_block *sb = get_super(bdev); int res = 0; if (sb) { /* * no need to lock the super, get_super holds the * read mutex so the filesystem cannot go away * under us (->put_super runs with the write lock * hold). */ shrink_dcache_sb(sb); res = invalidate_inodes(sb, kill_dirty); drop_super(sb); } invalidate_bdev(bdev); return res; } EXPORT_SYMBOL(__invalidate_device); void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || mapping->nrpages == 0) { spin_unlock(&inode->i_lock); continue; } __iget(inode); spin_unlock(&inode->i_lock); spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); old_inode = inode; bdev = I_BDEV(inode); mutex_lock(&bdev->bd_mutex); if (bdev->bd_openers) func(bdev, arg); mutex_unlock(&bdev->bd_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM 9p #if !defined(_TRACE_9P_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_9P_H #include <linux/tracepoint.h> #define P9_MSG_T \ EM( P9_TLERROR, "P9_TLERROR" ) \ EM( P9_RLERROR, "P9_RLERROR" ) \ EM( P9_TSTATFS, "P9_TSTATFS" ) \ EM( P9_RSTATFS, "P9_RSTATFS" ) \ EM( P9_TLOPEN, "P9_TLOPEN" ) \ EM( P9_RLOPEN, "P9_RLOPEN" ) \ EM( P9_TLCREATE, "P9_TLCREATE" ) \ EM( P9_RLCREATE, "P9_RLCREATE" ) \ EM( P9_TSYMLINK, "P9_TSYMLINK" ) \ EM( P9_RSYMLINK, "P9_RSYMLINK" ) \ EM( P9_TMKNOD, "P9_TMKNOD" ) \ EM( P9_RMKNOD, "P9_RMKNOD" ) \ EM( P9_TRENAME, "P9_TRENAME" ) \ EM( P9_RRENAME, "P9_RRENAME" ) \ EM( P9_TREADLINK, "P9_TREADLINK" ) \ EM( P9_RREADLINK, "P9_RREADLINK" ) \ EM( P9_TGETATTR, "P9_TGETATTR" ) \ EM( P9_RGETATTR, "P9_RGETATTR" ) \ EM( P9_TSETATTR, "P9_TSETATTR" ) \ EM( P9_RSETATTR, "P9_RSETATTR" ) \ EM( P9_TXATTRWALK, "P9_TXATTRWALK" ) \ EM( P9_RXATTRWALK, "P9_RXATTRWALK" ) \ EM( P9_TXATTRCREATE, "P9_TXATTRCREATE" ) \ EM( P9_RXATTRCREATE, "P9_RXATTRCREATE" ) \ EM( P9_TREADDIR, "P9_TREADDIR" ) \ EM( P9_RREADDIR, "P9_RREADDIR" ) \ EM( P9_TFSYNC, "P9_TFSYNC" ) \ EM( P9_RFSYNC, "P9_RFSYNC" ) \ EM( P9_TLOCK, "P9_TLOCK" ) \ EM( P9_RLOCK, "P9_RLOCK" ) \ EM( P9_TGETLOCK, "P9_TGETLOCK" ) \ EM( P9_RGETLOCK, "P9_RGETLOCK" ) \ EM( P9_TLINK, "P9_TLINK" ) \ EM( P9_RLINK, "P9_RLINK" ) \ EM( P9_TMKDIR, "P9_TMKDIR" ) \ EM( P9_RMKDIR, "P9_RMKDIR" ) \ EM( P9_TRENAMEAT, "P9_TRENAMEAT" ) \ EM( P9_RRENAMEAT, "P9_RRENAMEAT" ) \ EM( P9_TUNLINKAT, "P9_TUNLINKAT" ) \ EM( P9_RUNLINKAT, "P9_RUNLINKAT" ) \ EM( P9_TVERSION, "P9_TVERSION" ) \ EM( P9_RVERSION, "P9_RVERSION" ) \ EM( P9_TAUTH, "P9_TAUTH" ) \ EM( P9_RAUTH, "P9_RAUTH" ) \ EM( P9_TATTACH, "P9_TATTACH" ) \ EM( P9_RATTACH, "P9_RATTACH" ) \ EM( P9_TERROR, "P9_TERROR" ) \ EM( P9_RERROR, "P9_RERROR" ) \ EM( P9_TFLUSH, "P9_TFLUSH" ) \ EM( P9_RFLUSH, "P9_RFLUSH" ) \ EM( P9_TWALK, "P9_TWALK" ) \ EM( P9_RWALK, "P9_RWALK" ) \ EM( P9_TOPEN, "P9_TOPEN" ) \ EM( P9_ROPEN, "P9_ROPEN" ) \ EM( P9_TCREATE, "P9_TCREATE" ) \ EM( P9_RCREATE, "P9_RCREATE" ) \ EM( P9_TREAD, "P9_TREAD" ) \ EM( P9_RREAD, "P9_RREAD" ) \ EM( P9_TWRITE, "P9_TWRITE" ) \ EM( P9_RWRITE, "P9_RWRITE" ) \ EM( P9_TCLUNK, "P9_TCLUNK" ) \ EM( P9_RCLUNK, "P9_RCLUNK" ) \ EM( P9_TREMOVE, "P9_TREMOVE" ) \ EM( P9_RREMOVE, "P9_RREMOVE" ) \ EM( P9_TSTAT, "P9_TSTAT" ) \ EM( P9_RSTAT, "P9_RSTAT" ) \ EM( P9_TWSTAT, "P9_TWSTAT" ) \ EMe(P9_RWSTAT, "P9_RWSTAT" ) /* Define EM() to export the enums to userspace via TRACE_DEFINE_ENUM() */ #undef EM #undef EMe #define EM(a, b) TRACE_DEFINE_ENUM(a); #define EMe(a, b) TRACE_DEFINE_ENUM(a); P9_MSG_T /* * Now redefine the EM() and EMe() macros to map the enums to the strings * that will be printed in the output. */ #undef EM #undef EMe #define EM(a, b) { a, b }, #define EMe(a, b) { a, b } #define show_9p_op(type) \ __print_symbolic(type, P9_MSG_T) TRACE_EVENT(9p_client_req, TP_PROTO(struct p9_client *clnt, int8_t type, int tag), TP_ARGS(clnt, type, tag), TP_STRUCT__entry( __field( void *, clnt ) __field( __u8, type ) __field( __u32, tag ) ), TP_fast_assign( __entry->clnt = clnt; __entry->type = type; __entry->tag = tag; ), TP_printk("client %lu request %s tag %d", (long)__entry->clnt, show_9p_op(__entry->type), __entry->tag) ); TRACE_EVENT(9p_client_res, TP_PROTO(struct p9_client *clnt, int8_t type, int tag, int err), TP_ARGS(clnt, type, tag, err), TP_STRUCT__entry( __field( void *, clnt ) __field( __u8, type ) __field( __u32, tag ) __field( __u32, err ) ), TP_fast_assign( __entry->clnt = clnt; __entry->type = type; __entry->tag = tag; __entry->err = err; ), TP_printk("client %lu response %s tag %d err %d", (long)__entry->clnt, show_9p_op(__entry->type), __entry->tag, __entry->err) ); /* dump 32 bytes of protocol data */ #define P9_PROTO_DUMP_SZ 32 TRACE_EVENT(9p_protocol_dump, TP_PROTO(struct p9_client *clnt, struct p9_fcall *pdu), TP_ARGS(clnt, pdu), TP_STRUCT__entry( __field( void *, clnt ) __field( __u8, type ) __field( __u16, tag ) __array( unsigned char, line, P9_PROTO_DUMP_SZ ) ), TP_fast_assign( __entry->clnt = clnt; __entry->type = pdu->id; __entry->tag = pdu->tag; memcpy(__entry->line, pdu->sdata, P9_PROTO_DUMP_SZ); ), TP_printk("clnt %lu %s(tag = %d)\n%.3x: %16ph\n%.3x: %16ph\n", (unsigned long)__entry->clnt, show_9p_op(__entry->type), __entry->tag, 0, __entry->line, 16, __entry->line + 16) ); #endif /* _TRACE_9P_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM net #if !defined(_TRACE_NET_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_NET_H #include <linux/skbuff.h> #include <linux/netdevice.h> #include <linux/if_vlan.h> #include <linux/ip.h> #include <linux/tracepoint.h> TRACE_EVENT(net_dev_start_xmit, TP_PROTO(const struct sk_buff *skb, const struct net_device *dev), TP_ARGS(skb, dev), TP_STRUCT__entry( __string( name, dev->name ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( int, network_offset ) __field( bool, transport_offset_valid) __field( int, transport_offset) __field( u8, tx_flags ) __field( u16, gso_size ) __field( u16, gso_segs ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name, dev->name); __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->network_offset = skb_network_offset(skb); __entry->transport_offset_valid = skb_transport_header_was_set(skb); __entry->transport_offset = skb_transport_offset(skb); __entry->tx_flags = skb_shinfo(skb)->tx_flags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_segs = skb_shinfo(skb)->gso_segs; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x", __get_str(name), __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->len, __entry->data_len, __entry->network_offset, __entry->transport_offset_valid, __entry->transport_offset, __entry->tx_flags, __entry->gso_size, __entry->gso_segs, __entry->gso_type) ); TRACE_EVENT(net_dev_xmit, TP_PROTO(struct sk_buff *skb, int rc, struct net_device *dev, unsigned int skb_len), TP_ARGS(skb, rc, dev, skb_len), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __field( int, rc ) __string( name, dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb_len; __entry->rc = rc; __assign_str(name, dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u rc=%d", __get_str(name), __entry->skbaddr, __entry->len, __entry->rc) ); TRACE_EVENT(net_dev_xmit_timeout, TP_PROTO(struct net_device *dev, int queue_index), TP_ARGS(dev, queue_index), TP_STRUCT__entry( __string( name, dev->name ) __string( driver, netdev_drivername(dev)) __field( int, queue_index ) ), TP_fast_assign( __assign_str(name, dev->name); __assign_str(driver, netdev_drivername(dev)); __entry->queue_index = queue_index; ), TP_printk("dev=%s driver=%s queue=%d", __get_str(name), __get_str(driver), __entry->queue_index) ); DECLARE_EVENT_CLASS(net_dev_template, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __field( void *, skbaddr ) __field( unsigned int, len ) __string( name, skb->dev->name ) ), TP_fast_assign( __entry->skbaddr = skb; __entry->len = skb->len; __assign_str(name, skb->dev->name); ), TP_printk("dev=%s skbaddr=%p len=%u", __get_str(name), __entry->skbaddr, __entry->len) ) DEFINE_EVENT(net_dev_template, net_dev_queue, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_receive_skb, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_template, netif_rx, TP_PROTO(struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_verbose_template, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb), TP_STRUCT__entry( __string( name, skb->dev->name ) __field( unsigned int, napi_id ) __field( u16, queue_mapping ) __field( const void *, skbaddr ) __field( bool, vlan_tagged ) __field( u16, vlan_proto ) __field( u16, vlan_tci ) __field( u16, protocol ) __field( u8, ip_summed ) __field( u32, hash ) __field( bool, l4_hash ) __field( unsigned int, len ) __field( unsigned int, data_len ) __field( unsigned int, truesize ) __field( bool, mac_header_valid) __field( int, mac_header ) __field( unsigned char, nr_frags ) __field( u16, gso_size ) __field( u16, gso_type ) ), TP_fast_assign( __assign_str(name, skb->dev->name); #ifdef CONFIG_NET_RX_BUSY_POLL __entry->napi_id = skb->napi_id; #else __entry->napi_id = 0; #endif __entry->queue_mapping = skb->queue_mapping; __entry->skbaddr = skb; __entry->vlan_tagged = skb_vlan_tag_present(skb); __entry->vlan_proto = ntohs(skb->vlan_proto); __entry->vlan_tci = skb_vlan_tag_get(skb); __entry->protocol = ntohs(skb->protocol); __entry->ip_summed = skb->ip_summed; __entry->hash = skb->hash; __entry->l4_hash = skb->l4_hash; __entry->len = skb->len; __entry->data_len = skb->data_len; __entry->truesize = skb->truesize; __entry->mac_header_valid = skb_mac_header_was_set(skb); __entry->mac_header = skb_mac_header(skb) - skb->data; __entry->nr_frags = skb_shinfo(skb)->nr_frags; __entry->gso_size = skb_shinfo(skb)->gso_size; __entry->gso_type = skb_shinfo(skb)->gso_type; ), TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x", __get_str(name), __entry->napi_id, __entry->queue_mapping, __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci, __entry->protocol, __entry->ip_summed, __entry->hash, __entry->l4_hash, __entry->len, __entry->data_len, __entry->truesize, __entry->mac_header_valid, __entry->mac_header, __entry->nr_frags, __entry->gso_size, __entry->gso_type) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_frags_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, napi_gro_receive_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_receive_skb_list_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DEFINE_EVENT(net_dev_rx_verbose_template, netif_rx_ni_entry, TP_PROTO(const struct sk_buff *skb), TP_ARGS(skb) ); DECLARE_EVENT_CLASS(net_dev_rx_exit_template, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field(int, ret) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_frags_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, napi_gro_receive_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_rx_ni_exit, TP_PROTO(int ret), TP_ARGS(ret) ); DEFINE_EVENT(net_dev_rx_exit_template, netif_receive_skb_list_exit, TP_PROTO(int ret), TP_ARGS(ret) ); #endif /* _TRACE_NET_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cfg80211 #if !defined(__RDEV_OPS_TRACE) || defined(TRACE_HEADER_MULTI_READ) #define __RDEV_OPS_TRACE #include <linux/tracepoint.h> #include <linux/rtnetlink.h> #include <linux/etherdevice.h> #include <net/cfg80211.h> #include "core.h" #define MAC_ENTRY(entry_mac) __array(u8, entry_mac, ETH_ALEN) #define MAC_ASSIGN(entry_mac, given_mac) do { \ if (given_mac) \ memcpy(__entry->entry_mac, given_mac, ETH_ALEN); \ else \ eth_zero_addr(__entry->entry_mac); \ } while (0) #define MAC_PR_FMT "%pM" #define MAC_PR_ARG(entry_mac) (__entry->entry_mac) #define MAXNAME 32 #define WIPHY_ENTRY __array(char, wiphy_name, 32) #define WIPHY_ASSIGN strlcpy(__entry->wiphy_name, wiphy_name(wiphy), MAXNAME) #define WIPHY_PR_FMT "%s" #define WIPHY_PR_ARG __entry->wiphy_name #define WDEV_ENTRY __field(u32, id) #define WDEV_ASSIGN (__entry->id) = (!IS_ERR_OR_NULL(wdev) \ ? wdev->identifier : 0) #define WDEV_PR_FMT "wdev(%u)" #define WDEV_PR_ARG (__entry->id) #define NETDEV_ENTRY __array(char, name, IFNAMSIZ) \ __field(int, ifindex) #define NETDEV_ASSIGN \ do { \ memcpy(__entry->name, netdev->name, IFNAMSIZ); \ (__entry->ifindex) = (netdev->ifindex); \ } while (0) #define NETDEV_PR_FMT "netdev:%s(%d)" #define NETDEV_PR_ARG __entry->name, __entry->ifindex #define MESH_CFG_ENTRY __field(u16, dot11MeshRetryTimeout) \ __field(u16, dot11MeshConfirmTimeout) \ __field(u16, dot11MeshHoldingTimeout) \ __field(u16, dot11MeshMaxPeerLinks) \ __field(u8, dot11MeshMaxRetries) \ __field(u8, dot11MeshTTL) \ __field(u8, element_ttl) \ __field(bool, auto_open_plinks) \ __field(u32, dot11MeshNbrOffsetMaxNeighbor) \ __field(u8, dot11MeshHWMPmaxPREQretries) \ __field(u32, path_refresh_time) \ __field(u32, dot11MeshHWMPactivePathTimeout) \ __field(u16, min_discovery_timeout) \ __field(u16, dot11MeshHWMPpreqMinInterval) \ __field(u16, dot11MeshHWMPperrMinInterval) \ __field(u16, dot11MeshHWMPnetDiameterTraversalTime) \ __field(u8, dot11MeshHWMPRootMode) \ __field(u16, dot11MeshHWMPRannInterval) \ __field(bool, dot11MeshGateAnnouncementProtocol) \ __field(bool, dot11MeshForwarding) \ __field(s32, rssi_threshold) \ __field(u16, ht_opmode) \ __field(u32, dot11MeshHWMPactivePathToRootTimeout) \ __field(u16, dot11MeshHWMProotInterval) \ __field(u16, dot11MeshHWMPconfirmationInterval) \ __field(bool, dot11MeshNolearn) #define MESH_CFG_ASSIGN \ do { \ __entry->dot11MeshRetryTimeout = conf->dot11MeshRetryTimeout; \ __entry->dot11MeshConfirmTimeout = \ conf->dot11MeshConfirmTimeout; \ __entry->dot11MeshHoldingTimeout = \ conf->dot11MeshHoldingTimeout; \ __entry->dot11MeshMaxPeerLinks = conf->dot11MeshMaxPeerLinks; \ __entry->dot11MeshMaxRetries = conf->dot11MeshMaxRetries; \ __entry->dot11MeshTTL = conf->dot11MeshTTL; \ __entry->element_ttl = conf->element_ttl; \ __entry->auto_open_plinks = conf->auto_open_plinks; \ __entry->dot11MeshNbrOffsetMaxNeighbor = \ conf->dot11MeshNbrOffsetMaxNeighbor; \ __entry->dot11MeshHWMPmaxPREQretries = \ conf->dot11MeshHWMPmaxPREQretries; \ __entry->path_refresh_time = conf->path_refresh_time; \ __entry->dot11MeshHWMPactivePathTimeout = \ conf->dot11MeshHWMPactivePathTimeout; \ __entry->min_discovery_timeout = conf->min_discovery_timeout; \ __entry->dot11MeshHWMPpreqMinInterval = \ conf->dot11MeshHWMPpreqMinInterval; \ __entry->dot11MeshHWMPperrMinInterval = \ conf->dot11MeshHWMPperrMinInterval; \ __entry->dot11MeshHWMPnetDiameterTraversalTime = \ conf->dot11MeshHWMPnetDiameterTraversalTime; \ __entry->dot11MeshHWMPRootMode = conf->dot11MeshHWMPRootMode; \ __entry->dot11MeshHWMPRannInterval = \ conf->dot11MeshHWMPRannInterval; \ __entry->dot11MeshGateAnnouncementProtocol = \ conf->dot11MeshGateAnnouncementProtocol; \ __entry->dot11MeshForwarding = conf->dot11MeshForwarding; \ __entry->rssi_threshold = conf->rssi_threshold; \ __entry->ht_opmode = conf->ht_opmode; \ __entry->dot11MeshHWMPactivePathToRootTimeout = \ conf->dot11MeshHWMPactivePathToRootTimeout; \ __entry->dot11MeshHWMProotInterval = \ conf->dot11MeshHWMProotInterval; \ __entry->dot11MeshHWMPconfirmationInterval = \ conf->dot11MeshHWMPconfirmationInterval; \ __entry->dot11MeshNolearn = conf->dot11MeshNolearn; \ } while (0) #define CHAN_ENTRY __field(enum nl80211_band, band) \ __field(u32, center_freq) \ __field(u16, freq_offset) #define CHAN_ASSIGN(chan) \ do { \ if (chan) { \ __entry->band = chan->band; \ __entry->center_freq = chan->center_freq; \ __entry->freq_offset = chan->freq_offset; \ } else { \ __entry->band = 0; \ __entry->center_freq = 0; \ __entry->freq_offset = 0; \ } \ } while (0) #define CHAN_PR_FMT "band: %d, freq: %u.%03u" #define CHAN_PR_ARG __entry->band, __entry->center_freq, __entry->freq_offset #define CHAN_DEF_ENTRY __field(enum nl80211_band, band) \ __field(u32, control_freq) \ __field(u32, freq_offset) \ __field(u32, width) \ __field(u32, center_freq1) \ __field(u32, freq1_offset) \ __field(u32, center_freq2) #define CHAN_DEF_ASSIGN(chandef) \ do { \ if ((chandef) && (chandef)->chan) { \ __entry->band = (chandef)->chan->band; \ __entry->control_freq = \ (chandef)->chan->center_freq; \ __entry->freq_offset = \ (chandef)->chan->freq_offset; \ __entry->width = (chandef)->width; \ __entry->center_freq1 = (chandef)->center_freq1;\ __entry->freq1_offset = (chandef)->freq1_offset;\ __entry->center_freq2 = (chandef)->center_freq2;\ } else { \ __entry->band = 0; \ __entry->control_freq = 0; \ __entry->freq_offset = 0; \ __entry->width = 0; \ __entry->center_freq1 = 0; \ __entry->freq1_offset = 0; \ __entry->center_freq2 = 0; \ } \ } while (0) #define CHAN_DEF_PR_FMT \ "band: %d, control freq: %u.%03u, width: %d, cf1: %u.%03u, cf2: %u" #define CHAN_DEF_PR_ARG __entry->band, __entry->control_freq, \ __entry->freq_offset, __entry->width, \ __entry->center_freq1, __entry->freq1_offset, \ __entry->center_freq2 #define SINFO_ENTRY __field(int, generation) \ __field(u32, connected_time) \ __field(u32, inactive_time) \ __field(u32, rx_bytes) \ __field(u32, tx_bytes) \ __field(u32, rx_packets) \ __field(u32, tx_packets) \ __field(u32, tx_retries) \ __field(u32, tx_failed) \ __field(u32, rx_dropped_misc) \ __field(u32, beacon_loss_count) \ __field(u16, llid) \ __field(u16, plid) \ __field(u8, plink_state) #define SINFO_ASSIGN \ do { \ __entry->generation = sinfo->generation; \ __entry->connected_time = sinfo->connected_time; \ __entry->inactive_time = sinfo->inactive_time; \ __entry->rx_bytes = sinfo->rx_bytes; \ __entry->tx_bytes = sinfo->tx_bytes; \ __entry->rx_packets = sinfo->rx_packets; \ __entry->tx_packets = sinfo->tx_packets; \ __entry->tx_retries = sinfo->tx_retries; \ __entry->tx_failed = sinfo->tx_failed; \ __entry->rx_dropped_misc = sinfo->rx_dropped_misc; \ __entry->beacon_loss_count = sinfo->beacon_loss_count; \ __entry->llid = sinfo->llid; \ __entry->plid = sinfo->plid; \ __entry->plink_state = sinfo->plink_state; \ } while (0) #define BOOL_TO_STR(bo) (bo) ? "true" : "false" #define QOS_MAP_ENTRY __field(u8, num_des) \ __array(u8, dscp_exception, \ 2 * IEEE80211_QOS_MAP_MAX_EX) \ __array(u8, up, IEEE80211_QOS_MAP_LEN_MIN) #define QOS_MAP_ASSIGN(qos_map) \ do { \ if ((qos_map)) { \ __entry->num_des = (qos_map)->num_des; \ memcpy(__entry->dscp_exception, \ &(qos_map)->dscp_exception, \ 2 * IEEE80211_QOS_MAP_MAX_EX); \ memcpy(__entry->up, &(qos_map)->up, \ IEEE80211_QOS_MAP_LEN_MIN); \ } else { \ __entry->num_des = 0; \ memset(__entry->dscp_exception, 0, \ 2 * IEEE80211_QOS_MAP_MAX_EX); \ memset(__entry->up, 0, \ IEEE80211_QOS_MAP_LEN_MIN); \ } \ } while (0) /************************************************************* * rdev->ops traces * *************************************************************/ TRACE_EVENT(rdev_suspend, TP_PROTO(struct wiphy *wiphy, struct cfg80211_wowlan *wow), TP_ARGS(wiphy, wow), TP_STRUCT__entry( WIPHY_ENTRY __field(bool, any) __field(bool, disconnect) __field(bool, magic_pkt) __field(bool, gtk_rekey_failure) __field(bool, eap_identity_req) __field(bool, four_way_handshake) __field(bool, rfkill_release) __field(bool, valid_wow) ), TP_fast_assign( WIPHY_ASSIGN; if (wow) { __entry->any = wow->any; __entry->disconnect = wow->disconnect; __entry->magic_pkt = wow->magic_pkt; __entry->gtk_rekey_failure = wow->gtk_rekey_failure; __entry->eap_identity_req = wow->eap_identity_req; __entry->four_way_handshake = wow->four_way_handshake; __entry->rfkill_release = wow->rfkill_release; __entry->valid_wow = true; } else { __entry->valid_wow = false; } ), TP_printk(WIPHY_PR_FMT ", wow%s - any: %d, disconnect: %d, " "magic pkt: %d, gtk rekey failure: %d, eap identify req: %d, " "four way handshake: %d, rfkill release: %d.", WIPHY_PR_ARG, __entry->valid_wow ? "" : "(Not configured!)", __entry->any, __entry->disconnect, __entry->magic_pkt, __entry->gtk_rekey_failure, __entry->eap_identity_req, __entry->four_way_handshake, __entry->rfkill_release) ); TRACE_EVENT(rdev_return_int, TP_PROTO(struct wiphy *wiphy, int ret), TP_ARGS(wiphy, ret), TP_STRUCT__entry( WIPHY_ENTRY __field(int, ret) ), TP_fast_assign( WIPHY_ASSIGN; __entry->ret = ret; ), TP_printk(WIPHY_PR_FMT ", returned: %d", WIPHY_PR_ARG, __entry->ret) ); TRACE_EVENT(rdev_scan, TP_PROTO(struct wiphy *wiphy, struct cfg80211_scan_request *request), TP_ARGS(wiphy, request), TP_STRUCT__entry( WIPHY_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; ), TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG) ); DECLARE_EVENT_CLASS(wiphy_only_evt, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy), TP_STRUCT__entry( WIPHY_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; ), TP_printk(WIPHY_PR_FMT, WIPHY_PR_ARG) ); DEFINE_EVENT(wiphy_only_evt, rdev_resume, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy) ); DEFINE_EVENT(wiphy_only_evt, rdev_return_void, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy) ); DEFINE_EVENT(wiphy_only_evt, rdev_get_antenna, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy) ); DEFINE_EVENT(wiphy_only_evt, rdev_rfkill_poll, TP_PROTO(struct wiphy *wiphy), TP_ARGS(wiphy) ); DECLARE_EVENT_CLASS(wiphy_enabled_evt, TP_PROTO(struct wiphy *wiphy, bool enabled), TP_ARGS(wiphy, enabled), TP_STRUCT__entry( WIPHY_ENTRY __field(bool, enabled) ), TP_fast_assign( WIPHY_ASSIGN; __entry->enabled = enabled; ), TP_printk(WIPHY_PR_FMT ", %senabled ", WIPHY_PR_ARG, __entry->enabled ? "" : "not ") ); DEFINE_EVENT(wiphy_enabled_evt, rdev_set_wakeup, TP_PROTO(struct wiphy *wiphy, bool enabled), TP_ARGS(wiphy, enabled) ); TRACE_EVENT(rdev_add_virtual_intf, TP_PROTO(struct wiphy *wiphy, char *name, enum nl80211_iftype type), TP_ARGS(wiphy, name, type), TP_STRUCT__entry( WIPHY_ENTRY __string(vir_intf_name, name ? name : "<noname>") __field(enum nl80211_iftype, type) ), TP_fast_assign( WIPHY_ASSIGN; __assign_str(vir_intf_name, name ? name : "<noname>"); __entry->type = type; ), TP_printk(WIPHY_PR_FMT ", virtual intf name: %s, type: %d", WIPHY_PR_ARG, __get_str(vir_intf_name), __entry->type) ); DECLARE_EVENT_CLASS(wiphy_wdev_evt, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT, WIPHY_PR_ARG, WDEV_PR_ARG) ); DECLARE_EVENT_CLASS(wiphy_wdev_cookie_evt, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev, u64 cookie), TP_ARGS(wiphy, wdev, cookie), TP_STRUCT__entry( WIPHY_ENTRY WDEV_ENTRY __field(u64, cookie) ), TP_fast_assign( WIPHY_ASSIGN; WDEV_ASSIGN; __entry->cookie = cookie; ), TP_printk(WIPHY_PR_FMT ", " WDEV_PR_FMT ", cookie: %lld", WIPHY_PR_ARG, WDEV_PR_ARG, (unsigned long long)__entry->cookie) ); DEFINE_EVENT(wiphy_wdev_evt, rdev_return_wdev, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) ); DEFINE_EVENT(wiphy_wdev_evt, rdev_del_virtual_intf, TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev), TP_ARGS(wiphy, wdev) ); TRACE_EVENT(rdev_change_virtual_intf, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, enum nl80211_iftype type), TP_ARGS(wiphy, netdev, type), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(enum nl80211_iftype, type) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->type = type; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", type: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->type) ); DECLARE_EVENT_CLASS(key_handle, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) __field(u8, key_index) __field(bool, pairwise) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); __entry->key_index = key_index; __entry->pairwise = pairwise; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, pairwise: %s, mac addr: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) ); DEFINE_EVENT(key_handle, rdev_get_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) ); DEFINE_EVENT(key_handle, rdev_del_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr) ); TRACE_EVENT(rdev_add_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool pairwise, const u8 *mac_addr, u8 mode), TP_ARGS(wiphy, netdev, key_index, pairwise, mac_addr, mode), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(mac_addr) __field(u8, key_index) __field(bool, pairwise) __field(u8, mode) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(mac_addr, mac_addr); __entry->key_index = key_index; __entry->pairwise = pairwise; __entry->mode = mode; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key_index: %u, " "mode: %u, pairwise: %s, mac addr: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, __entry->mode, BOOL_TO_STR(__entry->pairwise), MAC_PR_ARG(mac_addr)) ); TRACE_EVENT(rdev_set_default_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index, bool unicast, bool multicast), TP_ARGS(wiphy, netdev, key_index, unicast, multicast), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(u8, key_index) __field(bool, unicast) __field(bool, multicast) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->key_index = key_index; __entry->unicast = unicast; __entry->multicast = multicast; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u, unicast: %s, multicast: %s", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index, BOOL_TO_STR(__entry->unicast), BOOL_TO_STR(__entry->multicast)) ); TRACE_EVENT(rdev_set_default_mgmt_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index), TP_ARGS(wiphy, netdev, key_index), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->key_index = key_index; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index) ); TRACE_EVENT(rdev_set_default_beacon_key, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 key_index), TP_ARGS(wiphy, netdev, key_index), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __field(u8, key_index) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; __entry->key_index = key_index; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", key index: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->key_index) ); TRACE_EVENT(rdev_start_ap, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_ap_settings *settings), TP_ARGS(wiphy, netdev, settings), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY CHAN_DEF_ENTRY __field(int, beacon_interval) __field(int, dtim_period) __array(char, ssid, IEEE80211_MAX_SSID_LEN + 1) __field(enum nl80211_hidden_ssid, hidden_ssid) __field(u32, wpa_ver) __field(bool, privacy) __field(enum nl80211_auth_type, auth_type) __field(int, inactivity_timeout) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; CHAN_DEF_ASSIGN(&settings->chandef); __entry->beacon_interval = settings->beacon_interval; __entry->dtim_period = settings->dtim_period; __entry->hidden_ssid = settings->hidden_ssid; __entry->wpa_ver = settings->crypto.wpa_versions; __entry->privacy = settings->privacy; __entry->auth_type = settings->auth_type; __entry->inactivity_timeout = settings->inactivity_timeout; memset(__entry->ssid, 0, IEEE80211_MAX_SSID_LEN + 1); memcpy(__entry->ssid, settings->ssid, settings->ssid_len); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", AP settings - ssid: %s, " CHAN_DEF_PR_FMT ", beacon interval: %d, dtim period: %d, " "hidden ssid: %d, wpa versions: %u, privacy: %s, " "auth type: %d, inactivity timeout: %d", WIPHY_PR_ARG, NETDEV_PR_ARG, __entry->ssid, CHAN_DEF_PR_ARG, __entry->beacon_interval, __entry->dtim_period, __entry->hidden_ssid, __entry->wpa_ver, BOOL_TO_STR(__entry->privacy), __entry->auth_type, __entry->inactivity_timeout) ); TRACE_EVENT(rdev_change_beacon, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct cfg80211_beacon_data *info), TP_ARGS(wiphy, netdev, info), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY __dynamic_array(u8, head, info ? info->head_len : 0) __dynamic_array(u8, tail, info ? info->tail_len : 0) __dynamic_array(u8, beacon_ies, info ? info->beacon_ies_len : 0) __dynamic_array(u8, proberesp_ies, info ? info->proberesp_ies_len : 0) __dynamic_array(u8, assocresp_ies, info ? info->assocresp_ies_len : 0) __dynamic_array(u8, probe_resp, info ? info->probe_resp_len : 0) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; if (info) { if (info->head) memcpy(__get_dynamic_array(head), info->head, info->head_len); if (info->tail) memcpy(__get_dynamic_array(tail), info->tail, info->tail_len); if (info->beacon_ies) memcpy(__get_dynamic_array(beacon_ies), info->beacon_ies, info->beacon_ies_len); if (info->proberesp_ies) memcpy(__get_dynamic_array(proberesp_ies), info->proberesp_ies, info->proberesp_ies_len); if (info->assocresp_ies) memcpy(__get_dynamic_array(assocresp_ies), info->assocresp_ies, info->assocresp_ies_len); if (info->probe_resp) memcpy(__get_dynamic_array(probe_resp), info->probe_resp, info->probe_resp_len); } ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) ); DECLARE_EVENT_CLASS(wiphy_netdev_evt, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_stop_ap, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_set_rekey_data, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_get_mesh_config, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_mesh, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ibss, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_leave_ocb, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_flush_pmksa, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DEFINE_EVENT(wiphy_netdev_evt, rdev_end_cac, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev), TP_ARGS(wiphy, netdev) ); DECLARE_EVENT_CLASS(station_add_change, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, struct station_parameters *params), TP_ARGS(wiphy, netdev, mac, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(sta_mac) __field(u32, sta_flags_mask) __field(u32, sta_flags_set) __field(u32, sta_modify_mask) __field(int, listen_interval) __field(u16, capability) __field(u16, aid) __field(u8, plink_action) __field(u8, plink_state) __field(u8, uapsd_queues) __field(u8, max_sp) __field(u8, opmode_notif) __field(bool, opmode_notif_used) __array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap)) __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap)) __array(char, vlan, IFNAMSIZ) __dynamic_array(u8, supported_rates, params->supported_rates_len) __dynamic_array(u8, ext_capab, params->ext_capab_len) __dynamic_array(u8, supported_channels, params->supported_channels_len) __dynamic_array(u8, supported_oper_classes, params->supported_oper_classes_len) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(sta_mac, mac); __entry->sta_flags_mask = params->sta_flags_mask; __entry->sta_flags_set = params->sta_flags_set; __entry->sta_modify_mask = params->sta_modify_mask; __entry->listen_interval = params->listen_interval; __entry->aid = params->aid; __entry->plink_action = params->plink_action; __entry->plink_state = params->plink_state; __entry->uapsd_queues = params->uapsd_queues; memset(__entry->ht_capa, 0, sizeof(struct ieee80211_ht_cap)); if (params->ht_capa) memcpy(__entry->ht_capa, params->ht_capa, sizeof(struct ieee80211_ht_cap)); memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap)); if (params->vht_capa) memcpy(__entry->vht_capa, params->vht_capa, sizeof(struct ieee80211_vht_cap)); memset(__entry->vlan, 0, sizeof(__entry->vlan)); if (params->vlan) memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ); if (params->supported_rates && params->supported_rates_len) memcpy(__get_dynamic_array(supported_rates), params->supported_rates, params->supported_rates_len); if (params->ext_capab && params->ext_capab_len) memcpy(__get_dynamic_array(ext_capab), params->ext_capab, params->ext_capab_len); if (params->supported_channels && params->supported_channels_len) memcpy(__get_dynamic_array(supported_channels), params->supported_channels, params->supported_channels_len); if (params->supported_oper_classes && params->supported_oper_classes_len) memcpy(__get_dynamic_array(supported_oper_classes), params->supported_oper_classes, params->supported_oper_classes_len); __entry->max_sp = params->max_sp; __entry->capability = params->capability; __entry->opmode_notif = params->opmode_notif; __entry->opmode_notif_used = params->opmode_notif_used; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", station flags mask: %u, station flags set: %u, " "station modify mask: %u, listen interval: %d, aid: %u, " "plink action: %u, plink state: %u, uapsd queues: %u, vlan:%s", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), __entry->sta_flags_mask, __entry->sta_flags_set, __entry->sta_modify_mask, __entry->listen_interval, __entry->aid, __entry->plink_action, __entry->plink_state, __entry->uapsd_queues, __entry->vlan) ); DEFINE_EVENT(station_add_change, rdev_add_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, struct station_parameters *params), TP_ARGS(wiphy, netdev, mac, params) ); DEFINE_EVENT(station_add_change, rdev_change_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, u8 *mac, struct station_parameters *params), TP_ARGS(wiphy, netdev, mac, params) ); DECLARE_EVENT_CLASS(wiphy_netdev_mac_evt, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, const u8 *mac), TP_ARGS(wiphy, netdev, mac), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(sta_mac) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(sta_mac, mac); ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", mac: " MAC_PR_FMT, WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac)) ); DECLARE_EVENT_CLASS(station_del, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct station_del_parameters *params), TP_ARGS(wiphy, netdev, params), TP_STRUCT__entry( WIPHY_ENTRY NETDEV_ENTRY MAC_ENTRY(sta_mac) __field(u8, subtype) __field(u16, reason_code) ), TP_fast_assign( WIPHY_ASSIGN; NETDEV_ASSIGN; MAC_ASSIGN(sta_mac, params->mac); __entry->subtype = params->subtype; __entry->reason_code = params->reason_code; ), TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT ", subtype: %u, reason_code: %u", WIPHY_PR_ARG, NETDEV_PR_ARG, MAC_PR_ARG(sta_mac), __entry->subtype, __entry->reason_code) ); DEFINE_EVENT(station_del, rdev_del_station, TP_PROTO(struct wiphy *wiphy, struct net_device *netdev, struct station_del_parameters *params), TP_ARGS(wiphy, netdev