1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the Forwarding Information Base. * * Authors: A.N.Kuznetsov, <kuznet@ms2.inr.ac.ru> */ #ifndef _NET_IP_FIB_H #define _NET_IP_FIB_H #include <net/flow.h> #include <linux/seq_file.h> #include <linux/rcupdate.h> #include <net/fib_notifier.h> #include <net/fib_rules.h> #include <net/inetpeer.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/refcount.h> struct fib_config { u8 fc_dst_len; u8 fc_tos; u8 fc_protocol; u8 fc_scope; u8 fc_type; u8 fc_gw_family; /* 2 bytes unused */ u32 fc_table; __be32 fc_dst; union { __be32 fc_gw4; struct in6_addr fc_gw6; }; int fc_oif; u32 fc_flags; u32 fc_priority; __be32 fc_prefsrc; u32 fc_nh_id; struct nlattr *fc_mx; struct rtnexthop *fc_mp; int fc_mx_len; int fc_mp_len; u32 fc_flow; u32 fc_nlflags; struct nl_info fc_nlinfo; struct nlattr *fc_encap; u16 fc_encap_type; }; struct fib_info; struct rtable; struct fib_nh_exception { struct fib_nh_exception __rcu *fnhe_next; int fnhe_genid; __be32 fnhe_daddr; u32 fnhe_pmtu; bool fnhe_mtu_locked; __be32 fnhe_gw; unsigned long fnhe_expires; struct rtable __rcu *fnhe_rth_input; struct rtable __rcu *fnhe_rth_output; unsigned long fnhe_stamp; struct rcu_head rcu; }; struct fnhe_hash_bucket { struct fib_nh_exception __rcu *chain; }; #define FNHE_HASH_SHIFT 11 #define FNHE_HASH_SIZE (1 << FNHE_HASH_SHIFT) #define FNHE_RECLAIM_DEPTH 5 struct fib_nh_common { struct net_device *nhc_dev; int nhc_oif; unsigned char nhc_scope; u8 nhc_family; u8 nhc_gw_family; unsigned char nhc_flags; struct lwtunnel_state *nhc_lwtstate; union { __be32 ipv4; struct in6_addr ipv6; } nhc_gw; int nhc_weight; atomic_t nhc_upper_bound; /* v4 specific, but allows fib6_nh with v4 routes */ struct rtable __rcu * __percpu *nhc_pcpu_rth_output; struct rtable __rcu *nhc_rth_input; struct fnhe_hash_bucket __rcu *nhc_exceptions; }; struct fib_nh { struct fib_nh_common nh_common; struct hlist_node nh_hash; struct fib_info *nh_parent; #ifdef CONFIG_IP_ROUTE_CLASSID __u32 nh_tclassid; #endif __be32 nh_saddr; int nh_saddr_genid; #define fib_nh_family nh_common.nhc_family #define fib_nh_dev nh_common.nhc_dev #define fib_nh_oif nh_common.nhc_oif #define fib_nh_flags nh_common.nhc_flags #define fib_nh_lws nh_common.nhc_lwtstate #define fib_nh_scope nh_common.nhc_scope #define fib_nh_gw_family nh_common.nhc_gw_family #define fib_nh_gw4 nh_common.nhc_gw.ipv4 #define fib_nh_gw6 nh_common.nhc_gw.ipv6 #define fib_nh_weight nh_common.nhc_weight #define fib_nh_upper_bound nh_common.nhc_upper_bound }; /* * This structure contains data shared by many of routes. */ struct nexthop; struct fib_info { struct hlist_node fib_hash; struct hlist_node fib_lhash; struct list_head nh_list; struct net *fib_net; int fib_treeref; refcount_t fib_clntref; unsigned int fib_flags; unsigned char fib_dead; unsigned char fib_protocol; unsigned char fib_scope; unsigned char fib_type; __be32 fib_prefsrc; u32 fib_tb_id; u32 fib_priority; struct dst_metrics *fib_metrics; #define fib_mtu fib_metrics->metrics[RTAX_MTU-1] #define fib_window fib_metrics->metrics[RTAX_WINDOW-1] #define fib_rtt fib_metrics->metrics[RTAX_RTT-1] #define fib_advmss fib_metrics->metrics[RTAX_ADVMSS-1] int fib_nhs; bool fib_nh_is_v6; bool nh_updated; struct nexthop *nh; struct rcu_head rcu; struct fib_nh fib_nh[]; }; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rule; #endif struct fib_table; struct fib_result { __be32 prefix; unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; u32 tclassid; struct fib_nh_common *nhc; struct fib_info *fi; struct fib_table *table; struct hlist_head *fa_head; }; struct fib_result_nl { __be32 fl_addr; /* To be looked up*/ u32 fl_mark; unsigned char fl_tos; unsigned char fl_scope; unsigned char tb_id_in; unsigned char tb_id; /* Results */ unsigned char prefixlen; unsigned char nh_sel; unsigned char type; unsigned char scope; int err; }; #ifdef CONFIG_IP_MULTIPLE_TABLES #define FIB_TABLE_HASHSZ 256 #else #define FIB_TABLE_HASHSZ 2 #endif __be32 fib_info_update_nhc_saddr(struct net *net, struct fib_nh_common *nhc, unsigned char scope); __be32 fib_result_prefsrc(struct net *net, struct fib_result *res); #define FIB_RES_NHC(res) ((res).nhc) #define FIB_RES_DEV(res) (FIB_RES_NHC(res)->nhc_dev) #define FIB_RES_OIF(res) (FIB_RES_NHC(res)->nhc_oif) struct fib_rt_info { struct fib_info *fi; u32 tb_id; __be32 dst; int dst_len; u8 tos; u8 type; u8 offload:1, trap:1, unused:6; }; struct fib_entry_notifier_info { struct fib_notifier_info info; /* must be first */ u32 dst; int dst_len; struct fib_info *fi; u8 tos; u8 type; u32 tb_id; }; struct fib_nh_notifier_info { struct fib_notifier_info info; /* must be first */ struct fib_nh *fib_nh; }; int call_fib4_notifier(struct notifier_block *nb, enum fib_event_type event_type, struct fib_notifier_info *info); int call_fib4_notifiers(struct net *net, enum fib_event_type event_type, struct fib_notifier_info *info); int __net_init fib4_notifier_init(struct net *net); void __net_exit fib4_notifier_exit(struct net *net); void fib_info_notify_update(struct net *net, struct nl_info *info); int fib_notify(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); struct fib_table { struct hlist_node tb_hlist; u32 tb_id; int tb_num_default; struct rcu_head rcu; unsigned long *tb_data; unsigned long __data[]; }; struct fib_dump_filter { u32 table_id; /* filter_set is an optimization that an entry is set */ bool filter_set; bool dump_routes; bool dump_exceptions; unsigned char protocol; unsigned char rt_type; unsigned int flags; struct net_device *dev; }; int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, struct fib_result *res, int fib_flags); int fib_table_insert(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_delete(struct net *, struct fib_table *, struct fib_config *, struct netlink_ext_ack *extack); int fib_table_dump(struct fib_table *table, struct sk_buff *skb, struct netlink_callback *cb, struct fib_dump_filter *filter); int fib_table_flush(struct net *net, struct fib_table *table, bool flush_all); struct fib_table *fib_trie_unmerge(struct fib_table *main_tb); void fib_table_flush_external(struct fib_table *table); void fib_free_table(struct fib_table *tb); #ifndef CONFIG_IP_MULTIPLE_TABLES #define TABLE_LOCAL_INDEX (RT_TABLE_LOCAL & (FIB_TABLE_HASHSZ - 1)) #define TABLE_MAIN_INDEX (RT_TABLE_MAIN & (FIB_TABLE_HASHSZ - 1)) static inline struct fib_table *fib_get_table(struct net *net, u32 id) { struct hlist_node *tb_hlist; struct hlist_head *ptr; ptr = id == RT_TABLE_LOCAL ? &net->ipv4.fib_table_hash[TABLE_LOCAL_INDEX] : &net->ipv4.fib_table_hash[TABLE_MAIN_INDEX]; tb_hlist = rcu_dereference_rtnl(hlist_first_rcu(ptr)); return hlist_entry(tb_hlist, struct fib_table, tb_hlist); } static inline struct fib_table *fib_new_table(struct net *net, u32 id) { return fib_get_table(net, id); } static inline int fib_lookup(struct net *net, const struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; rcu_read_lock(); tb = fib_get_table(net, RT_TABLE_MAIN); if (tb) err = fib_table_lookup(tb, flp, res, flags | FIB_LOOKUP_NOREF); if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return false; } static inline bool fib4_rule_default(const struct fib_rule *rule) { return true; } static inline int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack) { return 0; } static inline unsigned int fib4_rules_seq_read(struct net *net) { return 0; } static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { return false; } #else /* CONFIG_IP_MULTIPLE_TABLES */ int __net_init fib4_rules_init(struct net *net); void __net_exit fib4_rules_exit(struct net *net); struct fib_table *fib_new_table(struct net *net, u32 id); struct fib_table *fib_get_table(struct net *net, u32 id); int __fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags); static inline int fib_lookup(struct net *net, struct flowi4 *flp, struct fib_result *res, unsigned int flags) { struct fib_table *tb; int err = -ENETUNREACH; flags |= FIB_LOOKUP_NOREF; if (net->ipv4.fib_has_custom_rules) return __fib_lookup(net, flp, res, flags); rcu_read_lock(); res->tclassid = 0; tb = rcu_dereference_rtnl(net->ipv4.fib_main); if (tb) err = fib_table_lookup(tb, flp, res, flags); if (!err) goto out; tb = rcu_dereference_rtnl(net->ipv4.fib_default); if (tb) err = fib_table_lookup(tb, flp, res, flags); out: if (err == -EAGAIN) err = -ENETUNREACH; rcu_read_unlock(); return err; } static inline bool fib4_has_custom_rules(const struct net *net) { return net->ipv4.fib_has_custom_rules; } bool fib4_rule_default(const struct fib_rule *rule); int fib4_rules_dump(struct net *net, struct notifier_block *nb, struct netlink_ext_ack *extack); unsigned int fib4_rules_seq_read(struct net *net); static inline bool fib4_rules_early_flow_dissect(struct net *net, struct sk_buff *skb, struct flowi4 *fl4, struct flow_keys *flkeys) { unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP; if (!net->ipv4.fib_rules_require_fldissect) return false; skb_flow_dissect_flow_keys(skb, flkeys, flag); fl4->fl4_sport = flkeys->ports.src; fl4->fl4_dport = flkeys->ports.dst; fl4->flowi4_proto = flkeys->basic.ip_proto; return true; } #endif /* CONFIG_IP_MULTIPLE_TABLES */ /* Exported by fib_frontend.c */ extern const struct nla_policy rtm_ipv4_policy[]; void ip_fib_init(void); int fib_gw_from_via(struct fib_config *cfg, struct nlattr *nla, struct netlink_ext_ack *extack); __be32 fib_compute_spec_dst(struct sk_buff *skb); bool fib_info_nh_uses_dev(struct fib_info *fi, const struct net_device *dev); int fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, struct in_device *idev, u32 *itag); #ifdef CONFIG_IP_ROUTE_CLASSID static inline int fib_num_tclassid_users(struct net *net) { return atomic_read(&net->ipv4.fib_num_tclassid_users); } #else static inline int fib_num_tclassid_users(struct net *net) { return 0; } #endif int fib_unmerge(struct net *net); static inline bool nhc_l3mdev_matches_dev(const struct fib_nh_common *nhc, const struct net_device *dev) { if (nhc->nhc_dev == dev || l3mdev_master_ifindex_rcu(nhc->nhc_dev) == dev->ifindex) return true; return false; } /* Exported by fib_semantics.c */ int ip_fib_check_default(__be32 gw, struct net_device *dev); int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); int fib_sync_down_addr(struct net_device *dev, __be32 local); int fib_sync_up(struct net_device *dev, unsigned char nh_flags); void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); void fib_nhc_update_mtu(struct fib_nh_common *nhc, u32 new, u32 orig); #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4, const struct sk_buff *skb, struct flow_keys *flkeys); #endif int fib_check_nh(struct net *net, struct fib_nh *nh, u32 table, u8 scope, struct netlink_ext_ack *extack); void fib_select_multipath(struct fib_result *res, int hash); void fib_select_path(struct net *net, struct fib_result *res, struct flowi4 *fl4, const struct sk_buff *skb); int fib_nh_init(struct net *net, struct fib_nh *fib_nh, struct fib_config *cfg, int nh_weight, struct netlink_ext_ack *extack); void fib_nh_release(struct net *net, struct fib_nh *fib_nh); int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, struct nlattr *fc_encap, u16 fc_encap_type, void *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack); void fib_nh_common_release(struct fib_nh_common *nhc); /* Exported by fib_trie.c */ void fib_alias_hw_flags_set(struct net *net, const struct fib_rt_info *fri); void fib_trie_init(void); struct fib_table *fib_trie_table(u32 id, struct fib_table *alias); bool fib_lookup_good_nhc(const struct fib_nh_common *nhc, int fib_flags, const struct flowi4 *flp); static inline void fib_combine_itag(u32 *itag, const struct fib_result *res) { #ifdef CONFIG_IP_ROUTE_CLASSID struct fib_nh_common *nhc = res->nhc; #ifdef CONFIG_IP_MULTIPLE_TABLES u32 rtag; #endif if (nhc->nhc_family == AF_INET) { struct fib_nh *nh; nh = container_of(nhc, struct fib_nh, nh_common); *itag = nh->nh_tclassid << 16; } else { *itag = 0; } #ifdef CONFIG_IP_MULTIPLE_TABLES rtag = res->tclassid; if (*itag == 0) *itag = (rtag<<16); *itag |= (rtag>>16); #endif #endif } void fib_flush(struct net *net); void free_fib_info(struct fib_info *fi); static inline void fib_info_hold(struct fib_info *fi) { refcount_inc(&fi->fib_clntref); } static inline void fib_info_put(struct fib_info *fi) { if (refcount_dec_and_test(&fi->fib_clntref)) free_fib_info(fi); } #ifdef CONFIG_PROC_FS int __net_init fib_proc_init(struct net *net); void __net_exit fib_proc_exit(struct net *net); #else static inline int fib_proc_init(struct net *net) { return 0; } static inline void fib_proc_exit(struct net *net) { } #endif u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr); int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, struct fib_dump_filter *filter, struct netlink_callback *cb); int fib_nexthop_info(struct sk_buff *skb, const struct fib_nh_common *nh, u8 rt_family, unsigned char *flags, bool skip_oif); int fib_add_nexthop(struct sk_buff *skb, const struct fib_nh_common *nh, int nh_weight, u8 rt_family, u32 nh_tclassid); #endif /* _NET_FIB_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 /* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ #ifndef _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H #define _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H #ifndef __LITTLE_ENDIAN #define __LITTLE_ENDIAN 1234 #endif #ifndef __LITTLE_ENDIAN_BITFIELD #define __LITTLE_ENDIAN_BITFIELD #endif #include <linux/types.h> #include <linux/swab.h> #define __constant_htonl(x) ((__force __be32)___constant_swab32((x))) #define __constant_ntohl(x) ___constant_swab32((__force __be32)(x)) #define __constant_htons(x) ((__force __be16)___constant_swab16((x))) #define __constant_ntohs(x) ___constant_swab16((__force __be16)(x)) #define __constant_cpu_to_le64(x) ((__force __le64)(__u64)(x)) #define __constant_le64_to_cpu(x) ((__force __u64)(__le64)(x)) #define __constant_cpu_to_le32(x) ((__force __le32)(__u32)(x)) #define __constant_le32_to_cpu(x) ((__force __u32)(__le32)(x)) #define __constant_cpu_to_le16(x) ((__force __le16)(__u16)(x)) #define __constant_le16_to_cpu(x) ((__force __u16)(__le16)(x)) #define __constant_cpu_to_be64(x) ((__force __be64)___constant_swab64((x))) #define __constant_be64_to_cpu(x) ___constant_swab64((__force __u64)(__be64)(x)) #define __constant_cpu_to_be32(x) ((__force __be32)___constant_swab32((x))) #define __constant_be32_to_cpu(x) ___constant_swab32((__force __u32)(__be32)(x)) #define __constant_cpu_to_be16(x) ((__force __be16)___constant_swab16((x))) #define __constant_be16_to_cpu(x) ___constant_swab16((__force __u16)(__be16)(x)) #define __cpu_to_le64(x) ((__force __le64)(__u64)(x)) #define __le64_to_cpu(x) ((__force __u64)(__le64)(x)) #define __cpu_to_le32(x) ((__force __le32)(__u32)(x)) #define __le32_to_cpu(x) ((__force __u32)(__le32)(x)) #define __cpu_to_le16(x) ((__force __le16)(__u16)(x)) #define __le16_to_cpu(x) ((__force __u16)(__le16)(x)) #define __cpu_to_be64(x) ((__force __be64)__swab64((x))) #define __be64_to_cpu(x) __swab64((__force __u64)(__be64)(x)) #define __cpu_to_be32(x) ((__force __be32)__swab32((x))) #define __be32_to_cpu(x) __swab32((__force __u32)(__be32)(x)) #define __cpu_to_be16(x) ((__force __be16)__swab16((x))) #define __be16_to_cpu(x) __swab16((__force __u16)(__be16)(x)) static __always_inline __le64 __cpu_to_le64p(const __u64 *p) { return (__force __le64)*p; } static __always_inline __u64 __le64_to_cpup(const __le64 *p) { return (__force __u64)*p; } static __always_inline __le32 __cpu_to_le32p(const __u32 *p) { return (__force __le32)*p; } static __always_inline __u32 __le32_to_cpup(const __le32 *p) { return (__force __u32)*p; } static __always_inline __le16 __cpu_to_le16p(const __u16 *p) { return (__force __le16)*p; } static __always_inline __u16 __le16_to_cpup(const __le16 *p) { return (__force __u16)*p; } static __always_inline __be64 __cpu_to_be64p(const __u64 *p) { return (__force __be64)__swab64p(p); } static __always_inline __u64 __be64_to_cpup(const __be64 *p) { return __swab64p((__u64 *)p); } static __always_inline __be32 __cpu_to_be32p(const __u32 *p) { return (__force __be32)__swab32p(p); } static __always_inline __u32 __be32_to_cpup(const __be32 *p) { return __swab32p((__u32 *)p); } static __always_inline __be16 __cpu_to_be16p(const __u16 *p) { return (__force __be16)__swab16p(p); } static __always_inline __u16 __be16_to_cpup(const __be16 *p) { return __swab16p((__u16 *)p); } #define __cpu_to_le64s(x) do { (void)(x); } while (0) #define __le64_to_cpus(x) do { (void)(x); } while (0) #define __cpu_to_le32s(x) do { (void)(x); } while (0) #define __le32_to_cpus(x) do { (void)(x); } while (0) #define __cpu_to_le16s(x) do { (void)(x); } while (0) #define __le16_to_cpus(x) do { (void)(x); } while (0) #define __cpu_to_be64s(x) __swab64s((x)) #define __be64_to_cpus(x) __swab64s((x)) #define __cpu_to_be32s(x) __swab32s((x)) #define __be32_to_cpus(x) __swab32s((x)) #define __cpu_to_be16s(x) __swab16s((x)) #define __be16_to_cpus(x) __swab16s((x)) #endif /* _UAPI_LINUX_BYTEORDER_LITTLE_ENDIAN_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_USER_H #define _LINUX_SCHED_USER_H #include <linux/uidgid.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <linux/ratelimit.h> /* * Some day this will be a full-fledged user tracking system.. */ struct user_struct { refcount_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_FANOTIFY atomic_t fanotify_listeners; #endif #ifdef CONFIG_EPOLL atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ #endif #ifdef CONFIG_POSIX_MQUEUE /* protected by mq_lock */ unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */ #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ /* Hash table maintenance information */ struct hlist_node uidhash_node; kuid_t uid; #if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL) || \ defined(CONFIG_NET) || defined(CONFIG_IO_URING) atomic_long_t locked_vm; #endif #ifdef CONFIG_WATCH_QUEUE atomic_t nr_watches; /* The number of watches this user currently has */ #endif /* Miscellaneous per-user rate limit */ struct ratelimit_state ratelimit; }; extern int uids_sysfs_init(void); extern struct user_struct *find_user(kuid_t); extern struct user_struct root_user; #define INIT_USER (&root_user) /* per-UID process charging. */ extern struct user_struct * alloc_uid(kuid_t); static inline struct user_struct *get_uid(struct user_struct *u) { refcount_inc(&u->__count); return u; } extern void free_uid(struct user_struct *); #endif /* _LINUX_SCHED_USER_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ struct inotify_event_info { struct fsnotify_event fse; u32 mask; int wd; u32 sync_cookie; int name_len; char name[]; }; struct inotify_inode_mark { struct fsnotify_mark fsn_mark; int wd; }; static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct inotify_event_info, fse); } extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; extern struct kmem_cache *inotify_inode_mark_cachep; #ifdef CONFIG_INOTIFY_USER static inline void dec_inotify_instances(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); } static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts) { return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); } static inline void dec_inotify_watches(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM filemap #if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_FILEMAP_H #include <linux/types.h> #include <linux/tracepoint.h> #include <linux/mm.h> #include <linux/memcontrol.h> #include <linux/device.h> #include <linux/kdev_t.h> #include <linux/errseq.h> DECLARE_EVENT_CLASS(mm_filemap_op_page_cache, TP_PROTO(struct page *page), TP_ARGS(page), TP_STRUCT__entry( __field(unsigned long, pfn) __field(unsigned long, i_ino) __field(unsigned long, index) __field(dev_t, s_dev) ), TP_fast_assign( __entry->pfn = page_to_pfn(page); __entry->i_ino = page->mapping->host->i_ino; __entry->index = page->index; if (page->mapping->host->i_sb) __entry->s_dev = page->mapping->host->i_sb->s_dev; else __entry->s_dev = page->mapping->host->i_rdev; ), TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, pfn_to_page(__entry->pfn), __entry->pfn, __entry->index << PAGE_SHIFT) ); DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache, TP_PROTO(struct page *page), TP_ARGS(page) ); DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache, TP_PROTO(struct page *page), TP_ARGS(page) ); TRACE_EVENT(filemap_set_wb_err, TP_PROTO(struct address_space *mapping, errseq_t eseq), TP_ARGS(mapping, eseq), TP_STRUCT__entry( __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(errseq_t, errseq) ), TP_fast_assign( __entry->i_ino = mapping->host->i_ino; __entry->errseq = eseq; if (mapping->host->i_sb) __entry->s_dev = mapping->host->i_sb->s_dev; else __entry->s_dev = mapping->host->i_rdev; ), TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x", MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->errseq) ); TRACE_EVENT(file_check_and_advance_wb_err, TP_PROTO(struct file *file, errseq_t old), TP_ARGS(file, old), TP_STRUCT__entry( __field(struct file *, file) __field(unsigned long, i_ino) __field(dev_t, s_dev) __field(errseq_t, old) __field(errseq_t, new) ), TP_fast_assign( __entry->file = file; __entry->i_ino = file->f_mapping->host->i_ino; if (file->f_mapping->host->i_sb) __entry->s_dev = file->f_mapping->host->i_sb->s_dev; else __entry->s_dev = file->f_mapping->host->i_rdev; __entry->old = old; __entry->new = file->f_wb_err; ), TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x", __entry->file, MAJOR(__entry->s_dev), MINOR(__entry->s_dev), __entry->i_ino, __entry->old, __entry->new) ); #endif /* _TRACE_FILEMAP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MBCACHE_H #define _LINUX_MBCACHE_H #include <linux/hash.h> #include <linux/list_bl.h> #include <linux/list.h> #include <linux/atomic.h> #include <linux/fs.h> struct mb_cache; struct mb_cache_entry { /* List of entries in cache - protected by cache->c_list_lock */ struct list_head e_list; /* Hash table list - protected by hash chain bitlock */ struct hlist_bl_node e_hash_list; atomic_t e_refcnt; /* Key in hash - stable during lifetime of the entry */ u32 e_key; u32 e_referenced:1; u32 e_reusable:1; /* User provided value - stable during lifetime of the entry */ u64 e_value; }; struct mb_cache *mb_cache_create(int bucket_bits); void mb_cache_destroy(struct mb_cache *cache); int mb_cache_entry_create(struct mb_cache *cache, gfp_t mask, u32 key, u64 value, bool reusable); void __mb_cache_entry_free(struct mb_cache_entry *entry); static inline int mb_cache_entry_put(struct mb_cache *cache, struct mb_cache_entry *entry) { if (!atomic_dec_and_test(&entry->e_refcnt)) return 0; __mb_cache_entry_free(entry); return 1; } void mb_cache_entry_delete(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_get(struct mb_cache *cache, u32 key, u64 value); struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key); struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry); void mb_cache_entry_touch(struct mb_cache *cache, struct mb_cache_entry *entry); #endif /* _LINUX_MBCACHE_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* memcontrol.h - Memory Controller * * Copyright IBM Corporation, 2007 * Author Balbir Singh <balbir@linux.vnet.ibm.com> * * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> */ #ifndef _LINUX_MEMCONTROL_H #define _LINUX_MEMCONTROL_H #include <linux/cgroup.h> #include <linux/vm_event_item.h> #include <linux/hardirq.h> #include <linux/jump_label.h> #include <linux/page_counter.h> #include <linux/vmpressure.h> #include <linux/eventfd.h> #include <linux/mm.h> #include <linux/vmstat.h> #include <linux/writeback.h> #include <linux/page-flags.h> struct mem_cgroup; struct obj_cgroup; struct page; struct mm_struct; struct kmem_cache; /* Cgroup-specific page state, on top of universal node page state */ enum memcg_stat_item { MEMCG_SWAP = NR_VM_NODE_STAT_ITEMS, MEMCG_SOCK, MEMCG_PERCPU_B, MEMCG_NR_STAT, }; enum memcg_memory_event { MEMCG_LOW, MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; unsigned int generation; }; #ifdef CONFIG_MEMCG #define MEM_CGROUP_ID_SHIFT 16 #define MEM_CGROUP_ID_MAX USHRT_MAX struct mem_cgroup_id { int id; refcount_t ref; }; /* * Per memcg event counter is incremented at every pagein/pageout. With THP, * it will be incremented by the number of pages. This counter is used * to trigger some periodic events. This is straightforward and better * than using jiffies etc. to handle periodic memcg event. */ enum mem_cgroup_events_target { MEM_CGROUP_TARGET_THRESH, MEM_CGROUP_TARGET_SOFTLIMIT, MEM_CGROUP_NTARGETS, }; struct memcg_vmstats_percpu { long stat[MEMCG_NR_STAT]; unsigned long events[NR_VM_EVENT_ITEMS]; unsigned long nr_page_events; unsigned long targets[MEM_CGROUP_NTARGETS]; }; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; /* scan generation, increased every round-trip */ unsigned int generation; }; struct lruvec_stat { long count[NR_VM_NODE_STAT_ITEMS]; }; /* * Bitmap of shrinker::id corresponding to memcg-aware shrinkers, * which have elements charged to this memcg. */ struct memcg_shrinker_map { struct rcu_head rcu; unsigned long map[]; }; /* * per-node information in memory controller. */ struct mem_cgroup_per_node { struct lruvec lruvec; /* Legacy local VM stats */ struct lruvec_stat __percpu *lruvec_stat_local; /* Subtree VM stats (batched updates) */ struct lruvec_stat __percpu *lruvec_stat_cpu; atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter; struct memcg_shrinker_map __rcu *shrinker_map; struct rb_node tree_node; /* RB tree node */ unsigned long usage_in_excess;/* Set to the value by which */ /* the soft limit is exceeded*/ bool on_tree; struct mem_cgroup *memcg; /* Back pointer, we cannot */ /* use container_of */ }; struct mem_cgroup_threshold { struct eventfd_ctx *eventfd; unsigned long threshold; }; /* For threshold */ struct mem_cgroup_threshold_ary { /* An array index points to threshold just below or equal to usage. */ int current_threshold; /* Size of entries[] */ unsigned int size; /* Array of thresholds */ struct mem_cgroup_threshold entries[]; }; struct mem_cgroup_thresholds { /* Primary thresholds array */ struct mem_cgroup_threshold_ary *primary; /* * Spare threshold array. * This is needed to make mem_cgroup_unregister_event() "never fail". * It must be able to store at least primary->size - 1 entries. */ struct mem_cgroup_threshold_ary *spare; }; enum memcg_kmem_state { KMEM_NONE, KMEM_ALLOCATED, KMEM_ONLINE, }; #if defined(CONFIG_SMP) struct memcg_padding { char x[0]; } ____cacheline_internodealigned_in_smp; #define MEMCG_PADDING(name) struct memcg_padding name; #else #define MEMCG_PADDING(name) #endif /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss * one in a given round, we're likely to catch it later if it keeps * foreign-dirtying, so a fairly low count should be enough. * * See mem_cgroup_track_foreign_dirty_slowpath() for details. */ #define MEMCG_CGWB_FRN_CNT 4 struct memcg_cgwb_frn { u64 bdi_id; /* bdi->id of the foreign inode */ int memcg_id; /* memcg->css.id of foreign inode */ u64 at; /* jiffies_64 at the time of dirtying */ struct wb_completion done; /* tracks in-flight foreign writebacks */ }; /* * Bucket for arbitrarily byte-sized objects charged to a memory * cgroup. The bucket can be reparented in one piece when the cgroup * is destroyed, without having to round up the individual references * of all live memory objects in the wild. */ struct obj_cgroup { struct percpu_ref refcnt; struct mem_cgroup *memcg; atomic_t nr_charged_bytes; union { struct list_head list; struct rcu_head rcu; }; }; /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, * to help the administrator determine what knobs to tune. */ struct mem_cgroup { struct cgroup_subsys_state css; /* Private memcg ID. Used to ID objects that outlive the cgroup */ struct mem_cgroup_id id; /* Accounted resources */ struct page_counter memory; /* Both v1 & v2 */ union { struct page_counter swap; /* v2 only */ struct page_counter memsw; /* v1 only */ }; /* Legacy consumer-oriented counters */ struct page_counter kmem; /* v1 only */ struct page_counter tcpmem; /* v1 only */ /* Range enforcement for interrupt charges */ struct work_struct high_work; unsigned long soft_limit; /* vmpressure notifications */ struct vmpressure vmpressure; /* * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; /* * Should the OOM killer kill all belonging tasks, had it kill one? */ bool oom_group; /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; int swappiness; /* OOM-Killer disable */ int oom_kill_disable; /* memory.events and memory.events.local */ struct cgroup_file events_file; struct cgroup_file events_local_file; /* handle for "memory.swap.events" */ struct cgroup_file swap_events_file; /* protect arrays of thresholds */ struct mutex thresholds_lock; /* thresholds for memory usage. RCU-protected */ struct mem_cgroup_thresholds thresholds; /* thresholds for mem+swap usage. RCU-protected */ struct mem_cgroup_thresholds memsw_thresholds; /* For oom notifier event fd */ struct list_head oom_notify; /* * Should we move charges of a task when a task is moved into this * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; /* taken only while moving_account > 0 */ spinlock_t move_lock; unsigned long move_lock_flags; MEMCG_PADDING(_pad1_); atomic_long_t vmstats[MEMCG_NR_STAT]; atomic_long_t vmevents[NR_VM_EVENT_ITEMS]; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; atomic_long_t memory_events_local[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure; /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; #ifdef CONFIG_MEMCG_KMEM /* Index in the kmem_cache->memcg_params.memcg_caches array */ int kmemcg_id; enum memcg_kmem_state kmem_state; struct obj_cgroup __rcu *objcg; struct list_head objcg_list; /* list of inherited objcgs */ #endif MEMCG_PADDING(_pad2_); /* * set > 0 if pages under this cgroup are moving to other cgroup. */ atomic_t moving_account; struct task_struct *move_lock_task; /* Legacy local VM stats and events */ struct memcg_vmstats_percpu __percpu *vmstats_local; /* Subtree VM stats and events (batched updates) */ struct memcg_vmstats_percpu __percpu *vmstats_percpu; #ifdef CONFIG_CGROUP_WRITEBACK struct list_head cgwb_list; struct wb_domain cgwb_domain; struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT]; #endif /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; #ifdef CONFIG_TRANSPARENT_HUGEPAGE struct deferred_split deferred_split_queue; #endif struct mem_cgroup_per_node *nodeinfo[0]; /* WARNING: nodeinfo must be the last member here */ }; /* * size of first charge trial. "32" comes from vmscan.c's magic value. * TODO: maybe necessary to use big numbers in big irons. */ #define MEMCG_CHARGE_BATCH 32U extern struct mem_cgroup *root_mem_cgroup; static __always_inline bool memcg_stat_item_in_bytes(int idx) { if (idx == MEMCG_PERCPU_B) return true; return vmstat_item_in_bytes(idx); } static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return (memcg == root_mem_cgroup); } static inline bool mem_cgroup_disabled(void) { return !cgroup_subsys_enabled(memory_cgrp_subsys); } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; if (mem_cgroup_disabled()) return; /* * There is no reclaim protection applied to a targeted reclaim. * We are special casing this specific case here because * mem_cgroup_protected calculation is not robust enough to keep * the protection invariant for calculated effective values for * parallel reclaimers with different reclaim target. This is * especially a problem for tail memcgs (as they have pages on LRU) * which would want to have effective values 0 for targeted reclaim * but a different value for external reclaim. * * Example * Let's have global and A's reclaim in parallel: * | * A (low=2G, usage = 3G, max = 3G, children_low_usage = 1.5G) * |\ * | C (low = 1G, usage = 2.5G) * B (low = 1G, usage = 0.5G) * * For the global reclaim * A.elow = A.low * B.elow = min(B.usage, B.low) because children_low_usage <= A.elow * C.elow = min(C.usage, C.low) * * With the effective values resetting we have A reclaim * A.elow = 0 * B.elow = B.low * C.elow = C.low * * If the global reclaim races with A's reclaim then * B.elow = C.elow = 0 because children_low_usage > A.elow) * is possible and reclaiming B would be violating the protection. * */ if (root == memcg) return; *min = READ_ONCE(memcg->memory.emin); *low = READ_ONCE(memcg->memory.elow); } void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg); static inline bool mem_cgroup_supports_protection(struct mem_cgroup *memcg) { /* * The root memcg doesn't account charges, and doesn't support * protection. */ return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg); } static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) { if (!mem_cgroup_supports_protection(memcg)) return false; return READ_ONCE(memcg->memory.elow) >= page_counter_read(&memcg->memory); } static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) { if (!mem_cgroup_supports_protection(memcg)) return false; return READ_ONCE(memcg->memory.emin) >= page_counter_read(&memcg->memory); } int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage); static struct mem_cgroup_per_node * mem_cgroup_nodeinfo(struct mem_cgroup *memcg, int nid) { return memcg->nodeinfo[nid]; } /** * mem_cgroup_lruvec - get the lru list vector for a memcg & node * @memcg: memcg of the wanted lruvec * * Returns the lru list vector holding pages for a given @memcg & * @node combination. This can be the node lruvec, if the memory * controller is disabled. */ static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { struct mem_cgroup_per_node *mz; struct lruvec *lruvec; if (mem_cgroup_disabled()) { lruvec = &pgdat->__lruvec; goto out; } if (!memcg) memcg = root_mem_cgroup; mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id); lruvec = &mz->lruvec; out: /* * Since a node can be onlined after the mem_cgroup was created, * we have to be prepared to initialize lruvec->pgdat here; * and if offlined then reonlined, we need to reinitialize it. */ if (unlikely(lruvec->pgdat != pgdat)) lruvec->pgdat = pgdat; return lruvec; } struct lruvec *mem_cgroup_page_lruvec(struct page *, struct pglist_data *); struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); struct mem_cgroup *get_mem_cgroup_from_page(struct page *page); static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css){ return css ? container_of(css, struct mem_cgroup, css) : NULL; } static inline bool obj_cgroup_tryget(struct obj_cgroup *objcg) { return percpu_ref_tryget(&objcg->refcnt); } static inline void obj_cgroup_get(struct obj_cgroup *objcg) { percpu_ref_get(&objcg->refcnt); } static inline void obj_cgroup_put(struct obj_cgroup *objcg) { percpu_ref_put(&objcg->refcnt); } /* * After the initialization objcg->memcg is always pointing at * a valid memcg, but can be atomically swapped to the parent memcg. * * The caller must ensure that the returned memcg won't be released: * e.g. acquire the rcu_read_lock or css_set_lock. */ static inline struct mem_cgroup *obj_cgroup_memcg(struct obj_cgroup *objcg) { return READ_ONCE(objcg->memcg); } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { if (memcg) css_put(&memcg->css); } #define mem_cgroup_from_counter(counter, member) \ container_of(counter, struct mem_cgroup, member) struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *, struct mem_cgroup *, struct mem_cgroup_reclaim_cookie *); void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *); int mem_cgroup_scan_tasks(struct mem_cgroup *, int (*)(struct task_struct *, void *), void *); static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return 0; return memcg->id.id; } struct mem_cgroup *mem_cgroup_from_id(unsigned short id); static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return mem_cgroup_from_css(seq_css(m)); } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { struct mem_cgroup_per_node *mz; if (mem_cgroup_disabled()) return NULL; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return mz->memcg; } /** * parent_mem_cgroup - find the accounting parent of a memcg * @memcg: memcg whose parent to find * * Returns the parent memcg, or NULL if this is the root or the memory * controller is in legacy no-hierarchy mode. */ static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { if (!memcg->memory.parent) return NULL; return mem_cgroup_from_counter(memcg->memory.parent, memory); } static inline bool mem_cgroup_is_descendant(struct mem_cgroup *memcg, struct mem_cgroup *root) { if (root == memcg) return true; if (!root->use_hierarchy) return false; return cgroup_is_descendant(memcg->css.cgroup, root->css.cgroup); } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { struct mem_cgroup *task_memcg; bool match = false; rcu_read_lock(); task_memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (task_memcg) match = mem_cgroup_is_descendant(task_memcg, memcg); rcu_read_unlock(); return match; } struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page); ino_t page_cgroup_ino(struct page *page); static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) return true; return !!(memcg->css.flags & CSS_ONLINE); } /* * For memory reclaim. */ int mem_cgroup_select_victim_node(struct mem_cgroup *memcg); void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, int zid, int nr_pages); static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { struct mem_cgroup_per_node *mz; mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec); return READ_ONCE(mz->lru_zone_size[zone_idx][lru]); } void mem_cgroup_handle_over_high(void); unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); unsigned long mem_cgroup_size(struct mem_cgroup *memcg); void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p); void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg); static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); current->in_user_fault = 1; } static inline void mem_cgroup_exit_user_fault(void) { WARN_ON(!current->in_user_fault); current->in_user_fault = 0; } static inline bool task_in_memcg_oom(struct task_struct *p) { return p->memcg_in_oom; } bool mem_cgroup_oom_synchronize(bool wait); struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, struct mem_cgroup *oom_domain); void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP extern bool cgroup_memory_noswap; #endif struct mem_cgroup *lock_page_memcg(struct page *page); void __unlock_page_memcg(struct mem_cgroup *memcg); void unlock_page_memcg(struct page *page); /* * idx can be of type enum memcg_stat_item or node_stat_item. * Keep in sync with memcg_exact_page_state(). */ static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { long x = atomic_long_read(&memcg->vmstats[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } /* * idx can be of type enum memcg_stat_item or node_stat_item. * Keep in sync with memcg_exact_page_state(). */ static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { long x = 0; int cpu; for_each_possible_cpu(cpu) x += per_cpu(memcg->vmstats_local->stat[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val); /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { unsigned long flags; local_irq_save(flags); __mod_memcg_state(memcg, idx, val); local_irq_restore(flags); } /** * mod_memcg_page_state - update page state statistics * @page: the page * @idx: page state item to account * @val: number of pages (positive or negative) * * The @page must be locked or the caller must use lock_page_memcg() * to prevent double accounting when the page is concurrently being * moved to another memcg: * * lock_page(page) or lock_page_memcg(page) * if (TestClearPageState(page)) * mod_memcg_page_state(page, state, -1); * unlock_page(page) or unlock_page_memcg(page) * * Kernel pages are an exception to this, since they'll never move. */ static inline void __mod_memcg_page_state(struct page *page, int idx, int val) { if (page->mem_cgroup) __mod_memcg_state(page->mem_cgroup, idx, val); } static inline void mod_memcg_page_state(struct page *page, int idx, int val) { if (page->mem_cgroup) mod_memcg_state(page->mem_cgroup, idx, val); } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); x = atomic_long_read(&pn->lruvec_stat[idx]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; long x = 0; int cpu; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); for_each_possible_cpu(cpu) x += per_cpu(pn->lruvec_stat_local->count[idx], cpu); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val); void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val); void mod_memcg_obj_state(void *p, int idx, int val); static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_slab_state(p, idx, val); local_irq_restore(flags); } static inline void mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_memcg_lruvec_state(lruvec, idx, val); local_irq_restore(flags); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_state(lruvec, idx, val); local_irq_restore(flags); } static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { struct page *head = compound_head(page); /* rmap on tail pages */ pg_data_t *pgdat = page_pgdat(page); struct lruvec *lruvec; /* Untracked pages have no memcg, no lruvec. Update only the node */ if (!head->mem_cgroup) { __mod_node_page_state(pgdat, idx, val); return; } lruvec = mem_cgroup_lruvec(head->mem_cgroup, pgdat); __mod_lruvec_state(lruvec, idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { unsigned long flags; local_irq_save(flags); __mod_lruvec_page_state(page, idx, val); local_irq_restore(flags); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count); static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { unsigned long flags; local_irq_save(flags); __count_memcg_events(memcg, idx, count); local_irq_restore(flags); } static inline void count_memcg_page_event(struct page *page, enum vm_event_item idx) { if (page->mem_cgroup) count_memcg_events(page->mem_cgroup, idx, 1); } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) count_memcg_events(memcg, idx, 1); rcu_read_unlock(); } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { bool swap_event = event == MEMCG_SWAP_HIGH || event == MEMCG_SWAP_MAX || event == MEMCG_SWAP_FAIL; atomic_long_inc(&memcg->memory_events_local[event]); if (!swap_event) cgroup_file_notify(&memcg->events_local_file); do { atomic_long_inc(&memcg->memory_events[event]); if (swap_event) cgroup_file_notify(&memcg->swap_events_file); else cgroup_file_notify(&memcg->events_file); if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) break; if (cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_LOCAL_EVENTS) break; } while ((memcg = parent_mem_cgroup(memcg)) && !mem_cgroup_is_root(memcg)); } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { struct mem_cgroup *memcg; if (mem_cgroup_disabled()) return; rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) memcg_memory_event(memcg, event); rcu_read_unlock(); } void split_page_memcg(struct page *head, unsigned int nr); #else /* CONFIG_MEMCG */ #define MEM_CGROUP_ID_SHIFT 0 #define MEM_CGROUP_ID_MAX 0 struct mem_cgroup; static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) { return true; } static inline bool mem_cgroup_disabled(void) { return true; } static inline void memcg_memory_event(struct mem_cgroup *memcg, enum memcg_memory_event event) { } static inline void memcg_memory_event_mm(struct mm_struct *mm, enum memcg_memory_event event) { } static inline void mem_cgroup_protection(struct mem_cgroup *root, struct mem_cgroup *memcg, unsigned long *min, unsigned long *low) { *min = *low = 0; } static inline void mem_cgroup_calculate_protection(struct mem_cgroup *root, struct mem_cgroup *memcg) { } static inline bool mem_cgroup_below_low(struct mem_cgroup *memcg) { return false; } static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) { return false; } static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { return 0; } static inline void mem_cgroup_uncharge(struct page *page) { } static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } static inline void mem_cgroup_migrate(struct page *old, struct page *new) { } static inline struct lruvec *mem_cgroup_lruvec(struct mem_cgroup *memcg, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat) { return &pgdat->__lruvec; } static inline struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) { return NULL; } static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { return NULL; } static inline struct mem_cgroup *get_mem_cgroup_from_page(struct page *page) { return NULL; } static inline void mem_cgroup_put(struct mem_cgroup *memcg) { } static inline struct mem_cgroup * mem_cgroup_iter(struct mem_cgroup *root, struct mem_cgroup *prev, struct mem_cgroup_reclaim_cookie *reclaim) { return NULL; } static inline void mem_cgroup_iter_break(struct mem_cgroup *root, struct mem_cgroup *prev) { } static inline int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, int (*fn)(struct task_struct *, void *), void *arg) { return 0; } static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) { return 0; } static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) { WARN_ON_ONCE(id); /* XXX: This should always return root_mem_cgroup */ return NULL; } static inline struct mem_cgroup *mem_cgroup_from_seq(struct seq_file *m) { return NULL; } static inline struct mem_cgroup *lruvec_memcg(struct lruvec *lruvec) { return NULL; } static inline bool mem_cgroup_online(struct mem_cgroup *memcg) { return true; } static inline unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, enum lru_list lru, int zone_idx) { return 0; } static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } static inline unsigned long mem_cgroup_size(struct mem_cgroup *memcg) { return 0; } static inline void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct *p) { } static inline void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { } static inline struct mem_cgroup *lock_page_memcg(struct page *page) { return NULL; } static inline void __unlock_page_memcg(struct mem_cgroup *memcg) { } static inline void unlock_page_memcg(struct page *page) { } static inline void mem_cgroup_handle_over_high(void) { } static inline void mem_cgroup_enter_user_fault(void) { } static inline void mem_cgroup_exit_user_fault(void) { } static inline bool task_in_memcg_oom(struct task_struct *p) { return false; } static inline bool mem_cgroup_oom_synchronize(bool wait) { return false; } static inline struct mem_cgroup *mem_cgroup_get_oom_group( struct task_struct *victim, struct mem_cgroup *oom_domain) { return NULL; } static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) { } static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { return 0; } static inline unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx) { return 0; } static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int nr) { } static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int nr) { } static inline void __mod_memcg_page_state(struct page *page, int idx, int nr) { } static inline void mod_memcg_page_state(struct page *page, int idx, int nr) { } static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec, enum node_stat_item idx) { return node_page_state(lruvec_pgdat(lruvec), idx); } static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { } static inline void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { mod_node_page_state(lruvec_pgdat(lruvec), idx, val); } static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { mod_node_page_state(page_pgdat(page), idx, val); } static inline void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); __mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val) { struct page *page = virt_to_head_page(p); mod_node_page_state(page_pgdat(page), idx, val); } static inline void mod_memcg_obj_state(void *p, int idx, int val) { } static inline unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned) { return 0; } static inline void split_page_memcg(struct page *head, unsigned int nr) { } static inline void count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { } static inline void count_memcg_page_event(struct page *page, int idx) { } static inline void count_memcg_event_mm(struct mm_struct *mm, enum vm_event_item idx) { } #endif /* CONFIG_MEMCG */ /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __inc_memcg_state(struct mem_cgroup *memcg, int idx) { __mod_memcg_state(memcg, idx, 1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __dec_memcg_state(struct mem_cgroup *memcg, int idx) { __mod_memcg_state(memcg, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __inc_memcg_page_state(struct page *page, int idx) { __mod_memcg_page_state(page, idx, 1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __dec_memcg_page_state(struct page *page, int idx) { __mod_memcg_page_state(page, idx, -1); } static inline void __inc_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx) { __mod_lruvec_state(lruvec, idx, 1); } static inline void __dec_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx) { __mod_lruvec_state(lruvec, idx, -1); } static inline void __inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { __mod_lruvec_page_state(page, idx, 1); } static inline void __dec_lruvec_page_state(struct page *page, enum node_stat_item idx) { __mod_lruvec_page_state(page, idx, -1); } static inline void __inc_lruvec_slab_state(void *p, enum node_stat_item idx) { __mod_lruvec_slab_state(p, idx, 1); } static inline void __dec_lruvec_slab_state(void *p, enum node_stat_item idx) { __mod_lruvec_slab_state(p, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_state(struct mem_cgroup *memcg, int idx) { mod_memcg_state(memcg, idx, 1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void dec_memcg_state(struct mem_cgroup *memcg, int idx) { mod_memcg_state(memcg, idx, -1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void inc_memcg_page_state(struct page *page, int idx) { mod_memcg_page_state(page, idx, 1); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void dec_memcg_page_state(struct page *page, int idx) { mod_memcg_page_state(page, idx, -1); } static inline void inc_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx) { mod_lruvec_state(lruvec, idx, 1); } static inline void dec_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx) { mod_lruvec_state(lruvec, idx, -1); } static inline void inc_lruvec_page_state(struct page *page, enum node_stat_item idx) { mod_lruvec_page_state(page, idx, 1); } static inline void dec_lruvec_page_state(struct page *page, enum node_stat_item idx) { mod_lruvec_page_state(page, idx, -1); } static inline struct lruvec *parent_lruvec(struct lruvec *lruvec) { struct mem_cgroup *memcg; memcg = lruvec_memcg(lruvec); if (!memcg) return NULL; memcg = parent_mem_cgroup(memcg); if (!memcg) return NULL; return mem_cgroup_lruvec(memcg, lruvec_pgdat(lruvec)); } #ifdef CONFIG_CGROUP_WRITEBACK struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback); void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, struct bdi_writeback *wb); static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { if (mem_cgroup_disabled()) return; if (unlikely(&page->mem_cgroup->css != wb->memcg_css)) mem_cgroup_track_foreign_dirty_slowpath(page, wb); } void mem_cgroup_flush_foreign(struct bdi_writeback *wb); #else /* CONFIG_CGROUP_WRITEBACK */ static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) { return NULL; } static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pwriteback) { } static inline void mem_cgroup_track_foreign_dirty(struct page *page, struct bdi_writeback *wb) { } static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb) { } #endif /* CONFIG_CGROUP_WRITEBACK */ struct sock; bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages); #ifdef CONFIG_MEMCG extern struct static_key_false memcg_sockets_enabled_key; #define mem_cgroup_sockets_enabled static_branch_unlikely(&memcg_sockets_enabled_key) void mem_cgroup_sk_alloc(struct sock *sk); void mem_cgroup_sk_free(struct sock *sk); static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_pressure) return true; do { if (time_before(jiffies, memcg->socket_pressure)) return true; } while ((memcg = parent_mem_cgroup(memcg))); return false; } extern int memcg_expand_shrinker_maps(int new_id); extern void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id); #else #define mem_cgroup_sockets_enabled 0 static inline void mem_cgroup_sk_alloc(struct sock *sk) { }; static inline void mem_cgroup_sk_free(struct sock *sk) { }; static inline bool mem_cgroup_under_socket_pressure(struct mem_cgroup *memcg) { return false; } static inline void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id) { } #endif #ifdef CONFIG_MEMCG_KMEM int __memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, unsigned int nr_pages); void __memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages); int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order); void __memcg_kmem_uncharge_page(struct page *page, int order); struct obj_cgroup *get_obj_cgroup_from_current(void); int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size); void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size); extern struct static_key_false memcg_kmem_enabled_key; extern int memcg_nr_cache_ids; void memcg_get_cache_ids(void); void memcg_put_cache_ids(void); /* * Helper macro to loop through all memcg-specific caches. Callers must still * check if the cache is valid (it is either valid or NULL). * the slab_mutex must be held when looping through those caches */ #define for_each_memcg_cache_index(_idx) \ for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++) static inline bool memcg_kmem_enabled(void) { return static_branch_likely(&memcg_kmem_enabled_key); } static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { if (memcg_kmem_enabled()) return __memcg_kmem_charge_page(page, gfp, order); return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { if (memcg_kmem_enabled()) __memcg_kmem_uncharge_page(page, order); } static inline int memcg_kmem_charge(struct mem_cgroup *memcg, gfp_t gfp, unsigned int nr_pages) { if (memcg_kmem_enabled()) return __memcg_kmem_charge(memcg, gfp, nr_pages); return 0; } static inline void memcg_kmem_uncharge(struct mem_cgroup *memcg, unsigned int nr_pages) { if (memcg_kmem_enabled()) __memcg_kmem_uncharge(memcg, nr_pages); } /* * helper for accessing a memcg's index. It will be used as an index in the * child cache array in kmem_cache, and also to derive its name. This function * will return -1 when this is not a kmem-limited memcg. */ static inline int memcg_cache_id(struct mem_cgroup *memcg) { return memcg ? memcg->kmemcg_id : -1; } struct mem_cgroup *mem_cgroup_from_obj(void *p); #else static inline int memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void memcg_kmem_uncharge_page(struct page *page, int order) { } static inline int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order) { return 0; } static inline void __memcg_kmem_uncharge_page(struct page *page, int order) { } #define for_each_memcg_cache_index(_idx) \ for (; NULL; ) static inline bool memcg_kmem_enabled(void) { return false; } static inline int memcg_cache_id(struct mem_cgroup *memcg) { return -1; } static inline void memcg_get_cache_ids(void) { } static inline void memcg_put_cache_ids(void) { } static inline struct mem_cgroup *mem_cgroup_from_obj(void *p) { return NULL; } #endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * NET Generic infrastructure for Network protocols. * * Authors: Arnaldo Carvalho de Melo <acme@conectiva.com.br> */ #ifndef _TIMEWAIT_SOCK_H #define _TIMEWAIT_SOCK_H #include <linux/slab.h> #include <linux/bug.h> #include <net/sock.h> struct timewait_sock_ops { struct kmem_cache *twsk_slab; char *twsk_slab_name; unsigned int twsk_obj_size; int (*twsk_unique)(struct sock *sk, struct sock *sktw, void *twp); void (*twsk_destructor)(struct sock *sk); }; static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) { if (sk->sk_prot->twsk_prot->twsk_unique != NULL) return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp); return 0; } static inline void twsk_destructor(struct sock *sk) { if (sk->sk_prot->twsk_prot->twsk_destructor != NULL) sk->sk_prot->twsk_prot->twsk_destructor(sk); } #endif /* _TIMEWAIT_SOCK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 /* SPDX-License-Identifier: GPL-2.0 */ /* * sysfs.h - definitions for the device driver filesystem * * Copyright (c) 2001,2002 Patrick Mochel * Copyright (c) 2004 Silicon Graphics, Inc. * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007 Tejun Heo <teheo@suse.de> * * Please see Documentation/filesystems/sysfs.rst for more information. */ #ifndef _SYSFS_H_ #define _SYSFS_H_ #include <linux/kernfs.h> #include <linux/compiler.h> #include <linux/errno.h> #include <linux/list.h> #include <linux/lockdep.h> #include <linux/kobject_ns.h> #include <linux/stat.h> #include <linux/atomic.h> struct kobject; struct module; struct bin_attribute; enum kobj_ns_type; struct attribute { const char *name; umode_t mode; #ifdef CONFIG_DEBUG_LOCK_ALLOC bool ignore_lockdep:1; struct lock_class_key *key; struct lock_class_key skey; #endif }; /** * sysfs_attr_init - initialize a dynamically allocated sysfs attribute * @attr: struct attribute to initialize * * Initialize a dynamically allocated struct attribute so we can * make lockdep happy. This is a new requirement for attributes * and initially this is only needed when lockdep is enabled. * Lockdep gives a nice error when your attribute is added to * sysfs if you don't have this. */ #ifdef CONFIG_DEBUG_LOCK_ALLOC #define sysfs_attr_init(attr) \ do { \ static struct lock_class_key __key; \ \ (attr)->key = &__key; \ } while (0) #else #define sysfs_attr_init(attr) do {} while (0) #endif /** * struct attribute_group - data structure used to declare an attribute group. * @name: Optional: Attribute group name * If specified, the attribute group will be created in * a new subdirectory with this name. * @is_visible: Optional: Function to return permissions associated with an * attribute of the group. Will be called repeatedly for each * non-binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if an attribute is not visible. The returned value * will replace static permissions defined in struct attribute. * @is_bin_visible: * Optional: Function to return permissions associated with a * binary attribute of the group. Will be called repeatedly * for each binary attribute in the group. Only read/write * permissions as well as SYSFS_PREALLOC are accepted. Must * return 0 if a binary attribute is not visible. The returned * value will replace static permissions defined in * struct bin_attribute. * @attrs: Pointer to NULL terminated list of attributes. * @bin_attrs: Pointer to NULL terminated list of binary attributes. * Either attrs or bin_attrs or both must be provided. */ struct attribute_group { const char *name; umode_t (*is_visible)(struct kobject *, struct attribute *, int); umode_t (*is_bin_visible)(struct kobject *, struct bin_attribute *, int); struct attribute **attrs; struct bin_attribute **bin_attrs; }; /* * Use these macros to make defining attributes easier. * See include/linux/device.h for examples.. */ #define SYSFS_PREALLOC 010000 #define __ATTR(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _show, \ .store = _store, \ } #define __ATTR_PREALLOC(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), \ .mode = SYSFS_PREALLOC | VERIFY_OCTAL_PERMISSIONS(_mode) },\ .show = _show, \ .store = _store, \ } #define __ATTR_RO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .show = _name##_show, \ } #define __ATTR_RO_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ } #define __ATTR_RW_MODE(_name, _mode) { \ .attr = { .name = __stringify(_name), \ .mode = VERIFY_OCTAL_PERMISSIONS(_mode) }, \ .show = _name##_show, \ .store = _name##_store, \ } #define __ATTR_WO(_name) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .store = _name##_store, \ } #define __ATTR_RW(_name) __ATTR(_name, 0644, _name##_show, _name##_store) #define __ATTR_NULL { .attr = { .name = NULL } } #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) { \ .attr = {.name = __stringify(_name), .mode = _mode, \ .ignore_lockdep = true }, \ .show = _show, \ .store = _store, \ } #else #define __ATTR_IGNORE_LOCKDEP __ATTR #endif #define __ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group *_name##_groups[] = { \ &_name##_group, \ NULL, \ } #define ATTRIBUTE_GROUPS(_name) \ static const struct attribute_group _name##_group = { \ .attrs = _name##_attrs, \ }; \ __ATTRIBUTE_GROUPS(_name) struct file; struct vm_area_struct; struct bin_attribute { struct attribute attr; size_t size; void *private; ssize_t (*read)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); ssize_t (*write)(struct file *, struct kobject *, struct bin_attribute *, char *, loff_t, size_t); int (*mmap)(struct file *, struct kobject *, struct bin_attribute *attr, struct vm_area_struct *vma); }; /** * sysfs_bin_attr_init - initialize a dynamically allocated bin_attribute * @attr: struct bin_attribute to initialize * * Initialize a dynamically allocated struct bin_attribute so we * can make lockdep happy. This is a new requirement for * attributes and initially this is only needed when lockdep is * enabled. Lockdep gives a nice error when your attribute is * added to sysfs if you don't have this. */ #define sysfs_bin_attr_init(bin_attr) sysfs_attr_init(&(bin_attr)->attr) /* macros to create static binary attributes easier */ #define __BIN_ATTR(_name, _mode, _read, _write, _size) { \ .attr = { .name = __stringify(_name), .mode = _mode }, \ .read = _read, \ .write = _write, \ .size = _size, \ } #define __BIN_ATTR_RO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0444 }, \ .read = _name##_read, \ .size = _size, \ } #define __BIN_ATTR_WO(_name, _size) { \ .attr = { .name = __stringify(_name), .mode = 0200 }, \ .write = _name##_write, \ .size = _size, \ } #define __BIN_ATTR_RW(_name, _size) \ __BIN_ATTR(_name, 0644, _name##_read, _name##_write, _size) #define __BIN_ATTR_NULL __ATTR_NULL #define BIN_ATTR(_name, _mode, _read, _write, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR(_name, _mode, _read, \ _write, _size) #define BIN_ATTR_RO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RO(_name, _size) #define BIN_ATTR_WO(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_WO(_name, _size) #define BIN_ATTR_RW(_name, _size) \ struct bin_attribute bin_attr_##_name = __BIN_ATTR_RW(_name, _size) struct sysfs_ops { ssize_t (*show)(struct kobject *, struct attribute *, char *); ssize_t (*store)(struct kobject *, struct attribute *, const char *, size_t); }; #ifdef CONFIG_SYSFS int __must_check sysfs_create_dir_ns(struct kobject *kobj, const void *ns); void sysfs_remove_dir(struct kobject *kobj); int __must_check sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns); int __must_check sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns); int __must_check sysfs_create_mount_point(struct kobject *parent_kobj, const char *name); void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name); int __must_check sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); int __must_check sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode); struct kernfs_node *sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr); void sysfs_unbreak_active_protection(struct kernfs_node *kn); void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns); bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr); void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr); int __must_check sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr); void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr); int __must_check sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name); int __must_check sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name); void sysfs_remove_link(struct kobject *kobj, const char *name); int sysfs_rename_link_ns(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name, const void *new_ns); void sysfs_delete_link(struct kobject *dir, struct kobject *targ, const char *name); int __must_check sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp); int __must_check sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups); int __must_check sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups); int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group); void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group); int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp); void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp); int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name); void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name); int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name); void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr); int __must_check sysfs_init(void); static inline void sysfs_enable_ns(struct kernfs_node *kn) { return kernfs_enable_ns(kn); } int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid); int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid); int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid); int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid); __printf(2, 3) int sysfs_emit(char *buf, const char *fmt, ...); __printf(3, 4) int sysfs_emit_at(char *buf, int at, const char *fmt, ...); #else /* CONFIG_SYSFS */ static inline int sysfs_create_dir_ns(struct kobject *kobj, const void *ns) { return 0; } static inline void sysfs_remove_dir(struct kobject *kobj) { } static inline int sysfs_rename_dir_ns(struct kobject *kobj, const char *new_name, const void *new_ns) { return 0; } static inline int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, const void *new_ns) { return 0; } static inline int sysfs_create_mount_point(struct kobject *parent_kobj, const char *name) { return 0; } static inline void sysfs_remove_mount_point(struct kobject *parent_kobj, const char *name) { } static inline int sysfs_create_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { return 0; } static inline int sysfs_create_files(struct kobject *kobj, const struct attribute * const *attr) { return 0; } static inline int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, umode_t mode) { return 0; } static inline struct kernfs_node * sysfs_break_active_protection(struct kobject *kobj, const struct attribute *attr) { return NULL; } static inline void sysfs_unbreak_active_protection(struct kernfs_node *kn) { } static inline void sysfs_remove_file_ns(struct kobject *kobj, const struct attribute *attr, const void *ns) { } static inline bool sysfs_remove_file_self(struct kobject *kobj, const struct attribute *attr) { return false; } static inline void sysfs_remove_files(struct kobject *kobj, const struct attribute * const *attr) { } static inline int sysfs_create_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { return 0; } static inline void sysfs_remove_bin_file(struct kobject *kobj, const struct bin_attribute *attr) { } static inline int sysfs_create_link(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline int sysfs_create_link_nowarn(struct kobject *kobj, struct kobject *target, const char *name) { return 0; } static inline void sysfs_remove_link(struct kobject *kobj, const char *name) { } static inline int sysfs_rename_link_ns(struct kobject *k, struct kobject *t, const char *old_name, const char *new_name, const void *ns) { return 0; } static inline void sysfs_delete_link(struct kobject *k, struct kobject *t, const char *name) { } static inline int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_groups(struct kobject *kobj, const struct attribute_group **groups) { return 0; } static inline int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_remove_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline void sysfs_remove_groups(struct kobject *kobj, const struct attribute_group **groups) { } static inline int sysfs_add_file_to_group(struct kobject *kobj, const struct attribute *attr, const char *group) { return 0; } static inline void sysfs_remove_file_from_group(struct kobject *kobj, const struct attribute *attr, const char *group) { } static inline int sysfs_merge_group(struct kobject *kobj, const struct attribute_group *grp) { return 0; } static inline void sysfs_unmerge_group(struct kobject *kobj, const struct attribute_group *grp) { } static inline int sysfs_add_link_to_group(struct kobject *kobj, const char *group_name, struct kobject *target, const char *link_name) { return 0; } static inline void sysfs_remove_link_from_group(struct kobject *kobj, const char *group_name, const char *link_name) { } static inline int compat_only_sysfs_link_entry_to_kobj(struct kobject *kobj, struct kobject *target_kobj, const char *target_name, const char *symlink_name) { return 0; } static inline void sysfs_notify(struct kobject *kobj, const char *dir, const char *attr) { } static inline int __must_check sysfs_init(void) { return 0; } static inline void sysfs_enable_ns(struct kernfs_node *kn) { } static inline int sysfs_file_change_owner(struct kobject *kobj, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_link_change_owner(struct kobject *kobj, struct kobject *targ, const char *name, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_change_owner(struct kobject *kobj, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_groups_change_owner(struct kobject *kobj, const struct attribute_group **groups, kuid_t kuid, kgid_t kgid) { return 0; } static inline int sysfs_group_change_owner(struct kobject *kobj, const struct attribute_group *groups, kuid_t kuid, kgid_t kgid) { return 0; } __printf(2, 3) static inline int sysfs_emit(char *buf, const char *fmt, ...) { return 0; } __printf(3, 4) static inline int sysfs_emit_at(char *buf, int at, const char *fmt, ...) { return 0; } #endif /* CONFIG_SYSFS */ static inline int __must_check sysfs_create_file(struct kobject *kobj, const struct attribute *attr) { return sysfs_create_file_ns(kobj, attr, NULL); } static inline void sysfs_remove_file(struct kobject *kobj, const struct attribute *attr) { sysfs_remove_file_ns(kobj, attr, NULL); } static inline int sysfs_rename_link(struct kobject *kobj, struct kobject *target, const char *old_name, const char *new_name) { return sysfs_rename_link_ns(kobj, target, old_name, new_name, NULL); } static inline void sysfs_notify_dirent(struct kernfs_node *kn) { kernfs_notify(kn); } static inline struct kernfs_node *sysfs_get_dirent(struct kernfs_node *parent, const char *name) { return kernfs_find_and_get(parent, name); } static inline struct kernfs_node *sysfs_get(struct kernfs_node *kn) { kernfs_get(kn); return kn; } static inline void sysfs_put(struct kernfs_node *kn) { kernfs_put(kn); } #endif /* _SYSFS_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the ICMP module. * * Version: @(#)icmp.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _ICMP_H #define _ICMP_H #include <linux/icmp.h> #include <net/inet_sock.h> #include <net/snmp.h> #include <net/ip.h> struct icmp_err { int errno; unsigned int fatal:1; }; extern const struct icmp_err icmp_err_convert[]; #define ICMP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.icmp_statistics, field) #define __ICMP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.icmp_statistics, field) #define ICMPMSGOUT_INC_STATS(net, field) SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, field+256) #define ICMPMSGIN_INC_STATS(net, field) SNMP_INC_STATS_ATOMIC_LONG((net)->mib.icmpmsg_statistics, field) struct dst_entry; struct net_proto_family; struct sk_buff; struct net; void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, const struct ip_options *opt); static inline void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) { __icmp_send(skb_in, type, code, info, &IPCB(skb_in)->opt); } #if IS_ENABLED(CONFIG_NF_NAT) void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info); #else static inline void icmp_ndo_send(struct sk_buff *skb_in, int type, int code, __be32 info) { struct ip_options opts = { 0 }; __icmp_send(skb_in, type, code, info, &opts); } #endif int icmp_rcv(struct sk_buff *skb); int icmp_err(struct sk_buff *skb, u32 info); int icmp_init(void); void icmp_out_count(struct net *net, unsigned char type); #endif /* _ICMP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __FS_NOTIFY_FSNOTIFY_H_ #define __FS_NOTIFY_FSNOTIFY_H_ #include <linux/list.h> #include <linux/fsnotify.h> #include <linux/srcu.h> #include <linux/types.h> #include "../mount.h" static inline struct inode *fsnotify_conn_inode( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct inode, i_fsnotify_marks); } static inline struct mount *fsnotify_conn_mount( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct mount, mnt_fsnotify_marks); } static inline struct super_block *fsnotify_conn_sb( struct fsnotify_mark_connector *conn) { return container_of(conn->obj, struct super_block, s_fsnotify_marks); } /* destroy all events sitting in this groups notification queue */ extern void fsnotify_flush_notify(struct fsnotify_group *group); /* protects reads of inode and vfsmount marks list */ extern struct srcu_struct fsnotify_mark_srcu; /* compare two groups for sorting of marks lists */ extern int fsnotify_compare_groups(struct fsnotify_group *a, struct fsnotify_group *b); /* Destroy all marks attached to an object via connector */ extern void fsnotify_destroy_marks(fsnotify_connp_t *connp); /* run the list of all marks associated with inode and destroy them */ static inline void fsnotify_clear_marks_by_inode(struct inode *inode) { fsnotify_destroy_marks(&inode->i_fsnotify_marks); } /* run the list of all marks associated with vfsmount and destroy them */ static inline void fsnotify_clear_marks_by_mount(struct vfsmount *mnt) { fsnotify_destroy_marks(&real_mount(mnt)->mnt_fsnotify_marks); } /* run the list of all marks associated with sb and destroy them */ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) { fsnotify_destroy_marks(&sb->s_fsnotify_marks); } /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. */ extern void __fsnotify_update_child_dentry_flags(struct inode *inode); /* allocate and destroy and event holder to attach events to notification/access queues */ extern struct fsnotify_event_holder *fsnotify_alloc_event_holder(void); extern void fsnotify_destroy_event_holder(struct fsnotify_event_holder *holder); extern struct kmem_cache *fsnotify_mark_connector_cachep; #endif /* __FS_NOTIFY_FSNOTIFY_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/sched/coredump.h> #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags); vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_NEVER_DAX, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PUD_SHIFT PUD_SHIFT #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; } if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { /* Explicitly disabled through madvise. */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; return true; } /* * to be used on vmas which are known to support THP. * Use transparent_hugepage_active otherwise */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { /* * If the hardware/firmware marked hugepage support disabled. */ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (vma_is_temporary_stack(vma)) return false; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; if (vma_is_dax(vma)) return true; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) return !!(vma->vm_flags & VM_HUGEPAGE); return false; } bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); bool is_transparent_hugepage(struct page *page); bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); static inline int is_swap_pmd(pmd_t pmd) { return !pmd_none(pmd) && !pmd_present(pmd); } /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * thp_head - Head page of a transparent huge page. * @page: Any page (tail, head or regular) found in the page cache. */ static inline struct page *thp_head(struct page *page) { return compound_head(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_ORDER; return 0; } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. */ static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_NR; return 1; } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap); vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); } static inline bool is_huge_zero_pud(pud_t pud) { return false; } struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } static inline struct list_head *page_deferred_list(struct page *page) { /* * Global or memcg deferred list in the second tail pages is * occupied by compound_head. */ return &page[2].deferred_list; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) static inline struct page *thp_head(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return page; } static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 0; } static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 1; } static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { return false; } static inline bool transparent_hugepage_active(struct vm_area_struct *vma) { return false; } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { return false; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { return false; } static inline void prep_transhuge_page(struct page *page) {} static inline bool is_transparent_hugepage(struct page *page) { return false; } #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline bool can_split_huge_page(struct page *page, int *pextra_pins) { BUILD_BUG(); return false; } static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { return 0; } static inline int split_huge_page(struct page *page) { return 0; } static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { BUG(); return 0; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next) { } static inline int is_swap_pmd(pmd_t pmd) { return 0; } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { return 0; } static inline bool is_huge_zero_page(struct page *page) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline bool is_huge_zero_pud(pud_t pud) { return false; } static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline bool thp_migration_supported(void) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #endif /* _LINUX_HUGE_MM_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_BLOCKGROUP_LOCK_H #define _LINUX_BLOCKGROUP_LOCK_H /* * Per-blockgroup locking for ext2 and ext3. * * Simple hashed spinlocking. */ #include <linux/spinlock.h> #include <linux/cache.h> #ifdef CONFIG_SMP #define NR_BG_LOCKS (4 << ilog2(NR_CPUS < 32 ? NR_CPUS : 32)) #else #define NR_BG_LOCKS 1 #endif struct bgl_lock { spinlock_t lock; } ____cacheline_aligned_in_smp; struct blockgroup_lock { struct bgl_lock locks[NR_BG_LOCKS]; }; static inline void bgl_lock_init(struct blockgroup_lock *bgl) { int i; for (i = 0; i < NR_BG_LOCKS; i++) spin_lock_init(&bgl->locks[i].lock); } static inline spinlock_t * bgl_lock_ptr(struct blockgroup_lock *bgl, unsigned int block_group) { return &bgl->locks[block_group & (NR_BG_LOCKS-1)].lock; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NET_SCM_H #define __LINUX_NET_SCM_H #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/sched/signal.h> /* Well, we should have at least one descriptor open * to accept passed FDs 8) */ #define SCM_MAX_FD 253 struct scm_creds { u32 pid; kuid_t uid; kgid_t gid; }; struct scm_fp_list { short count; short max; struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; struct scm_cookie { struct pid *pid; /* Skb credentials */ struct scm_fp_list *fp; /* Passed files */ struct scm_creds creds; /* Skb credentials */ #ifdef CONFIG_SECURITY_NETWORK u32 secid; /* Passed security ID */ #endif }; void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm); void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm); void __scm_destroy(struct scm_cookie *scm); struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl); #ifdef CONFIG_SECURITY_NETWORK static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { security_socket_getpeersec_dgram(sock, NULL, &scm->secid); } #else static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; } static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) { scm_destroy_cred(scm); if (scm->fp) __scm_destroy(scm); } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); scm->creds.uid = INVALID_UID; scm->creds.gid = INVALID_GID; if (forcecreds) scm_set_cred(scm, task_tgid(current), current_uid(), current_gid()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; return __scm_send(sock, msg, scm); } #ifdef CONFIG_SECURITY_NETWORK static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { char *secdata; u32 seclen; int err; if (test_bit(SOCK_PASSSEC, &sock->flags)) { err = security_secid_to_secctx(scm->secid, &secdata, &seclen); if (!err) { put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); security_release_secctx(secdata, seclen); } } } #else static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return; } if (test_bit(SOCK_PASSCRED, &sock->flags)) { struct user_namespace *current_ns = current_user_ns(); struct ucred ucreds = { .pid = scm->creds.pid, .uid = from_kuid_munged(current_ns, scm->creds.uid), .gid = from_kgid_munged(current_ns, scm->creds.gid), }; put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } scm_destroy_cred(scm); scm_passec(sock, msg, scm); if (!scm->fp) return; scm_detach_fds(msg, scm); } #endif /* __LINUX_NET_SCM_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Hash algorithms. * * Copyright (c) 2008 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_HASH_H #define _CRYPTO_INTERNAL_HASH_H #include <crypto/algapi.h> #include <crypto/hash.h> struct ahash_request; struct scatterlist; struct crypto_hash_walk { char *data; unsigned int offset; unsigned int alignmask; struct page *pg; unsigned int entrylen; unsigned int total; struct scatterlist *sg; unsigned int flags; }; struct ahash_instance { void (*free)(struct ahash_instance *inst); union { struct { char head[offsetof(struct ahash_alg, halg.base)]; struct crypto_instance base; } s; struct ahash_alg alg; }; }; struct shash_instance { void (*free)(struct shash_instance *inst); union { struct { char head[offsetof(struct shash_alg, base)]; struct crypto_instance base; } s; struct shash_alg alg; }; }; struct crypto_ahash_spawn { struct crypto_spawn base; }; struct crypto_shash_spawn { struct crypto_spawn base; }; int crypto_hash_walk_done(struct crypto_hash_walk *walk, int err); int crypto_hash_walk_first(struct ahash_request *req, struct crypto_hash_walk *walk); static inline int crypto_hash_walk_last(struct crypto_hash_walk *walk) { return !(walk->entrylen | walk->total); } int crypto_register_ahash(struct ahash_alg *alg); void crypto_unregister_ahash(struct ahash_alg *alg); int crypto_register_ahashes(struct ahash_alg *algs, int count); void crypto_unregister_ahashes(struct ahash_alg *algs, int count); int ahash_register_instance(struct crypto_template *tmpl, struct ahash_instance *inst); bool crypto_shash_alg_has_setkey(struct shash_alg *alg); static inline bool crypto_shash_alg_needs_key(struct shash_alg *alg) { return crypto_shash_alg_has_setkey(alg) && !(alg->base.cra_flags & CRYPTO_ALG_OPTIONAL_KEY); } bool crypto_hash_alg_has_setkey(struct hash_alg_common *halg); int crypto_grab_ahash(struct crypto_ahash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_ahash(struct crypto_ahash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct hash_alg_common *crypto_spawn_ahash_alg( struct crypto_ahash_spawn *spawn) { return __crypto_hash_alg_common(spawn->base.alg); } int crypto_register_shash(struct shash_alg *alg); void crypto_unregister_shash(struct shash_alg *alg); int crypto_register_shashes(struct shash_alg *algs, int count); void crypto_unregister_shashes(struct shash_alg *algs, int count); int shash_register_instance(struct crypto_template *tmpl, struct shash_instance *inst); void shash_free_singlespawn_instance(struct shash_instance *inst); int crypto_grab_shash(struct crypto_shash_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_shash(struct crypto_shash_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct shash_alg *crypto_spawn_shash_alg( struct crypto_shash_spawn *spawn) { return __crypto_shash_alg(spawn->base.alg); } int shash_ahash_update(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_finup(struct ahash_request *req, struct shash_desc *desc); int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc); int crypto_init_shash_ops_async(struct crypto_tfm *tfm); static inline void *crypto_ahash_ctx(struct crypto_ahash *tfm) { return crypto_tfm_ctx(crypto_ahash_tfm(tfm)); } static inline struct ahash_alg *__crypto_ahash_alg(struct crypto_alg *alg) { return container_of(__crypto_hash_alg_common(alg), struct ahash_alg, halg); } static inline void crypto_ahash_set_reqsize(struct crypto_ahash *tfm, unsigned int reqsize) { tfm->reqsize = reqsize; } static inline struct crypto_instance *ahash_crypto_instance( struct ahash_instance *inst) { return &inst->s.base; } static inline struct ahash_instance *ahash_instance( struct crypto_instance *inst) { return container_of(inst, struct ahash_instance, s.base); } static inline struct ahash_instance *ahash_alg_instance( struct crypto_ahash *ahash) { return ahash_instance(crypto_tfm_alg_instance(&ahash->base)); } static inline void *ahash_instance_ctx(struct ahash_instance *inst) { return crypto_instance_ctx(ahash_crypto_instance(inst)); } static inline void ahash_request_complete(struct ahash_request *req, int err) { req->base.complete(&req->base, err); } static inline u32 ahash_request_flags(struct ahash_request *req) { return req->base.flags; } static inline struct crypto_ahash *crypto_spawn_ahash( struct crypto_ahash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline int ahash_enqueue_request(struct crypto_queue *queue, struct ahash_request *request) { return crypto_enqueue_request(queue, &request->base); } static inline struct ahash_request *ahash_dequeue_request( struct crypto_queue *queue) { return ahash_request_cast(crypto_dequeue_request(queue)); } static inline void *crypto_shash_ctx(struct crypto_shash *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline struct crypto_instance *shash_crypto_instance( struct shash_instance *inst) { return &inst->s.base; } static inline struct shash_instance *shash_instance( struct crypto_instance *inst) { return container_of(inst, struct shash_instance, s.base); } static inline struct shash_instance *shash_alg_instance( struct crypto_shash *shash) { return shash_instance(crypto_tfm_alg_instance(&shash->base)); } static inline void *shash_instance_ctx(struct shash_instance *inst) { return crypto_instance_ctx(shash_crypto_instance(inst)); } static inline struct crypto_shash *crypto_spawn_shash( struct crypto_shash_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline void *crypto_shash_ctx_aligned(struct crypto_shash *tfm) { return crypto_tfm_ctx_aligned(&tfm->base); } static inline struct crypto_shash *__crypto_shash_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_shash, base); } #endif /* _CRYPTO_INTERNAL_HASH_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 /* SPDX-License-Identifier: GPL-2.0 */ /* * Copyright (c) 2013 Red Hat, Inc. and Parallels Inc. All rights reserved. * Authors: David Chinner and Glauber Costa * * Generic LRU infrastructure */ #ifndef _LRU_LIST_H #define _LRU_LIST_H #include <linux/list.h> #include <linux/nodemask.h> #include <linux/shrinker.h> struct mem_cgroup; /* list_lru_walk_cb has to always return one of those */ enum lru_status { LRU_REMOVED, /* item removed from list */ LRU_REMOVED_RETRY, /* item removed, but lock has been dropped and reacquired */ LRU_ROTATE, /* item referenced, give another pass */ LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ }; struct list_lru_one { struct list_head list; /* may become negative during memcg reparenting */ long nr_items; }; struct list_lru_memcg { struct rcu_head rcu; /* array of per cgroup lists, indexed by memcg_cache_id */ struct list_lru_one *lru[]; }; struct list_lru_node { /* protects all lists on the node, including per cgroup */ spinlock_t lock; /* global list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; #ifdef CONFIG_MEMCG_KMEM /* for cgroup aware lrus points to per cgroup lists, otherwise NULL */ struct list_lru_memcg __rcu *memcg_lrus; #endif long nr_items; } ____cacheline_aligned_in_smp; struct list_lru { struct list_lru_node *node; #ifdef CONFIG_MEMCG_KMEM struct list_head list; int shrinker_id; bool memcg_aware; #endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct lock_class_key *key, struct shrinker *shrinker); #define list_lru_init(lru) \ __list_lru_init((lru), false, NULL, NULL) #define list_lru_init_key(lru, key) \ __list_lru_init((lru), false, (key), NULL) #define list_lru_init_memcg(lru, shrinker) \ __list_lru_init((lru), true, NULL, shrinker) int memcg_update_all_list_lrus(int num_memcgs); void memcg_drain_all_list_lrus(int src_idx, struct mem_cgroup *dst_memcg); /** * list_lru_add: add an element to the lru list's tail * @list_lru: the lru pointer * @item: the item to be added. * * If the element is already part of a list, this function returns doing * nothing. Therefore the caller does not need to keep state about whether or * not the element already belongs in the list and is allowed to lazy update * it. Note however that this is valid for *a* list, not *this* list. If * the caller organize itself in a way that elements can be in more than * one type of list, it is up to the caller to fully remove the item from * the previous list (with list_lru_del() for instance) before moving it * to @list_lru * * Return value: true if the list was updated, false otherwise */ bool list_lru_add(struct list_lru *lru, struct list_head *item); /** * list_lru_del: delete an element to the lru list * @list_lru: the lru pointer * @item: the item to be deleted. * * This function works analogously as list_lru_add in terms of list * manipulation. The comments about an element already pertaining to * a list are also valid for list_lru_del. * * Return value: true if the list was updated, false otherwise */ bool list_lru_del(struct list_lru *lru, struct list_head *item); /** * list_lru_count_one: return the number of objects currently held by @lru * @lru: the lru pointer. * @nid: the node id to count from. * @memcg: the cgroup to count from. * * Always return a non-negative number, 0 for empty lists. There is no * guarantee that the list is not updated while the count is being computed. * Callers that want such a guarantee need to provide an outer lock. */ unsigned long list_lru_count_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg); unsigned long list_lru_count_node(struct list_lru *lru, int nid); static inline unsigned long list_lru_shrink_count(struct list_lru *lru, struct shrink_control *sc) { return list_lru_count_one(lru, sc->nid, sc->memcg); } static inline unsigned long list_lru_count(struct list_lru *lru) { long count = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) count += list_lru_count_node(lru, nid); return count; } void list_lru_isolate(struct list_lru_one *list, struct list_head *item); void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, struct list_lru_one *list, spinlock_t *lock, void *cb_arg); /** * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is resposible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * This function will scan all elements in a particular list_lru, calling the * @isolate callback for each of those items, along with the current list * spinlock and a caller-provided opaque. The @isolate callback can choose to * drop the lock internally, but *must* return with the lock held. The callback * will return an enum lru_status telling the list_lru infrastructure what to * do with the object being scanned. * * Please note that nr_to_walk does not mean how many objects will be freed, * just how many objects will be scanned. * * Return value: the number of objects effectively removed from the LRU. */ unsigned long list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); /** * list_lru_walk_one_irq: walk a list_lru, isolating and disposing freeable items. * @lru: the lru pointer. * @nid: the node id to scan from. * @memcg: the cgroup to scan from. * @isolate: callback function that is resposible for deciding what to do with * the item currently being scanned * @cb_arg: opaque type that will be passed to @isolate * @nr_to_walk: how many items to scan. * * Same as @list_lru_walk_one except that the spinlock is acquired with * spin_lock_irq(). */ unsigned long list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); unsigned long list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk); static inline unsigned long list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_shrink_walk_irq(struct list_lru *lru, struct shrink_control *sc, list_lru_walk_cb isolate, void *cb_arg) { return list_lru_walk_one_irq(lru, sc->nid, sc->memcg, isolate, cb_arg, &sc->nr_to_scan); } static inline unsigned long list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate, void *cb_arg, unsigned long nr_to_walk) { long isolated = 0; int nid; for_each_node_state(nid, N_NORMAL_MEMORY) { isolated += list_lru_walk_node(lru, nid, isolate, cb_arg, &nr_to_walk); if (nr_to_walk <= 0) break; } return isolated; } #endif /* _LRU_LIST_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock_bh(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock_bh(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock_bh(); n = __ipv4_neigh_lookup_noref(dev, key); if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 3359 3360 3361 3362 3363 3364 3365 3366 3367 3368 3369 3370 3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384 3385 3386 3387 3388 3389 3390 3391 3392 3393 3394 3395 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542 3543 3544 3545 3546 3547 3548 3549 3550 3551 3552 3553 3554 3555 3556 3557 3558 3559 3560 3561 3562 3563 3564 3565 3566 3567 3568 3569 3570 3571 3572 3573 3574 3575 3576 3577 3578 3579 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 3595 3596 3597 3598 3599 3600 3601 3602 3603 3604 3605 3606 3607 3608 3609 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 3649 3650 3651 3652 3653 3654 3655 3656 3657 3658 3659 3660 3661 3662 3663 3664 3665 3666 3667 3668 3669 3670 3671 3672 3673 3674 3675 3676 3677 3678 3679 3680 3681 3682 3683 3684 3685 3686 3687 3688 3689 3690 3691 3692 3693 3694 3695 3696 3697 3698 3699 3700 3701 3702 3703 3704 3705 3706 3707 3708 3709 3710 3711 3712 3713 3714 3715 3716 3717 3718 3719 3720 3721 3722 3723 3724 3725 3726 3727 3728 3729 3730 3731 3732 3733 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 3782 3783 3784 3785 3786 3787 3788 3789 3790 3791 3792 3793 3794 3795 3796 3797 3798 3799 3800 3801 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817 3818 3819 3820 3821 3822 3823 3824 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 3894 3895 3896 3897 3898 3899 3900 3901 3902 3903 3904 3905 3906 3907 3908 3909 3910 3911 3912 3913 3914 3915 3916 3917 3918 3919 3920 3921 3922 3923 3924 3925 3926 3927 3928 3929 3930 3931 3932 3933 3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947 3948 3949 3950 3951 3952 3953 3954 3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 3981 3982 3983 3984 3985 3986 3987 3988 3989 3990 3991 3992 3993 3994 3995 3996 3997 3998 3999 4000 4001 4002 4003 4004 4005 4006 4007 4008 4009 4010 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 4035 4036 4037 4038 4039 4040 4041 4042 4043 4044 4045 4046 4047 4048 4049 4050 4051 4052 4053 4054 4055 4056 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 4093 4094 4095 4096 4097 4098 4099 4100 4101 4102 4103 4104 4105 4106 4107 4108 4109 4110 4111 4112 4113 4114 4115 4116 4117 4118 4119 4120 4121 4122 4123 4124 4125 4126 4127 4128 4129 4130 4131 4132 4133 4134 4135 4136 4137 4138 4139 4140 4141 4142 4143 4144 4145 4146 4147 4148 4149 4150 4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 4165 4166 4167 4168 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 4199 4200 4201 4202 4203 4204 4205 4206 4207 4208 4209 4210 4211 4212 4213 4214 4215 4216 4217 4218 4219 4220 4221 4222 4223 4224 4225 4226 4227 4228 4229 4230 4231 4232 4233 4234 4235 4236 4237 4238 4239 4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 4254 4255 4256 4257 4258 4259 4260 4261 4262 4263 4264 4265 4266 4267 4268 4269 4270 4271 4272 4273 4274 4275 4276 4277 4278 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 4294 4295 4296 4297 4298 4299 4300 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 4364 4365 4366 4367 4368 4369 4370 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 4404 4405 4406 4407 4408 4409 4410 4411 4412 4413 4414 4415 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 4430 4431 4432 4433 4434 4435 4436 4437 4438 4439 4440 4441 4442 4443 4444 4445 4446 4447 4448 4449 4450 4451 4452 4453 4454 4455 4456 4457 4458 4459 4460 4461 4462 4463 4464 4465 4466 4467 4468 4469 4470 4471 4472 4473 4474 4475 // SPDX-License-Identifier: GPL-2.0-only /* * xfrm_policy.c * * Changes: * Mitsuru KANDA @USAGI * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro <kunihiro@ipinfusion.com> * IPv6 support * Kazunori MIYAZAWA @USAGI * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins <derek@ihtfp.com> Add the post_input processor * */ #include <linux/err.h> #include <linux/slab.h> #include <linux/kmod.h> #include <linux/list.h> #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/notifier.h> #include <linux/netdevice.h> #include <linux/netfilter.h> #include <linux/module.h> #include <linux/cache.h> #include <linux/cpu.h> #include <linux/audit.h> #include <linux/rhashtable.h> #include <linux/if_tunnel.h> #include <net/dst.h> #include <net/flow.h> #include <net/xfrm.h> #include <net/ip.h> #if IS_ENABLED(CONFIG_IPV6_MIP6) #include <net/mip6.h> #endif #ifdef CONFIG_XFRM_STATISTICS #include <net/snmp.h> #endif #ifdef CONFIG_XFRM_ESPINTCP #include <net/espintcp.h> #endif #include "xfrm_hash.h" #define XFRM_QUEUE_TMO_MIN ((unsigned)(HZ/10)) #define XFRM_QUEUE_TMO_MAX ((unsigned)(60*HZ)) #define XFRM_MAX_QUEUE_LEN 100 struct xfrm_flo { struct dst_entry *dst_orig; u8 flags; }; /* prefixes smaller than this are stored in lists, not trees. */ #define INEXACT_PREFIXLEN_IPV4 16 #define INEXACT_PREFIXLEN_IPV6 48 struct xfrm_pol_inexact_node { struct rb_node node; union { xfrm_address_t addr; struct rcu_head rcu; }; u8 prefixlen; struct rb_root root; /* the policies matching this node, can be empty list */ struct hlist_head hhead; }; /* xfrm inexact policy search tree: * xfrm_pol_inexact_bin = hash(dir,type,family,if_id); * | * +---- root_d: sorted by daddr:prefix * | | * | xfrm_pol_inexact_node * | | * | +- root: sorted by saddr/prefix * | | | * | | xfrm_pol_inexact_node * | | | * | | + root: unused * | | | * | | + hhead: saddr:daddr policies * | | * | +- coarse policies and all any:daddr policies * | * +---- root_s: sorted by saddr:prefix * | | * | xfrm_pol_inexact_node * | | * | + root: unused * | | * | + hhead: saddr:any policies * | * +---- coarse policies and all any:any policies * * Lookups return four candidate lists: * 1. any:any list from top-level xfrm_pol_inexact_bin * 2. any:daddr list from daddr tree * 3. saddr:daddr list from 2nd level daddr tree * 4. saddr:any list from saddr tree * * This result set then needs to be searched for the policy with * the lowest priority. If two results have same prio, youngest one wins. */ struct xfrm_pol_inexact_key { possible_net_t net; u32 if_id; u16 family; u8 dir, type; }; struct xfrm_pol_inexact_bin { struct xfrm_pol_inexact_key k; struct rhash_head head; /* list containing '*:*' policies */ struct hlist_head hhead; seqcount_spinlock_t count; /* tree sorted by daddr/prefix */ struct rb_root root_d; /* tree sorted by saddr/prefix */ struct rb_root root_s; /* slow path below */ struct list_head inexact_bins; struct rcu_head rcu; }; enum xfrm_pol_inexact_candidate_type { XFRM_POL_CAND_BOTH, XFRM_POL_CAND_SADDR, XFRM_POL_CAND_DADDR, XFRM_POL_CAND_ANY, XFRM_POL_CAND_MAX, }; struct xfrm_pol_inexact_candidates { struct hlist_head *res[XFRM_POL_CAND_MAX]; }; static DEFINE_SPINLOCK(xfrm_if_cb_lock); static struct xfrm_if_cb const __rcu *xfrm_if_cb __read_mostly; static DEFINE_SPINLOCK(xfrm_policy_afinfo_lock); static struct xfrm_policy_afinfo const __rcu *xfrm_policy_afinfo[AF_INET6 + 1] __read_mostly; static struct kmem_cache *xfrm_dst_cache __ro_after_init; static struct rhashtable xfrm_policy_inexact_table; static const struct rhashtable_params xfrm_pol_inexact_params; static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr); static int stale_bundle(struct dst_entry *dst); static int xfrm_bundle_ok(struct xfrm_dst *xdst); static void xfrm_policy_queue_process(struct timer_list *t); static void __xfrm_policy_link(struct xfrm_policy *pol, int dir); static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id); static struct xfrm_policy * xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl); static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, struct xfrm_policy *policy); static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr); static inline bool xfrm_pol_hold_rcu(struct xfrm_policy *policy) { return refcount_inc_not_zero(&policy->refcnt); } static inline bool __xfrm4_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi4 *fl4 = &fl->u.ip4; return addr4_match(fl4->daddr, sel->daddr.a4, sel->prefixlen_d) && addr4_match(fl4->saddr, sel->saddr.a4, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl4->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl4->uli) ^ sel->sport) & sel->sport_mask) && (fl4->flowi4_proto == sel->proto || !sel->proto) && (fl4->flowi4_oif == sel->ifindex || !sel->ifindex); } static inline bool __xfrm6_selector_match(const struct xfrm_selector *sel, const struct flowi *fl) { const struct flowi6 *fl6 = &fl->u.ip6; return addr_match(&fl6->daddr, &sel->daddr, sel->prefixlen_d) && addr_match(&fl6->saddr, &sel->saddr, sel->prefixlen_s) && !((xfrm_flowi_dport(fl, &fl6->uli) ^ sel->dport) & sel->dport_mask) && !((xfrm_flowi_sport(fl, &fl6->uli) ^ sel->sport) & sel->sport_mask) && (fl6->flowi6_proto == sel->proto || !sel->proto) && (fl6->flowi6_oif == sel->ifindex || !sel->ifindex); } bool xfrm_selector_match(const struct xfrm_selector *sel, const struct flowi *fl, unsigned short family) { switch (family) { case AF_INET: return __xfrm4_selector_match(sel, fl); case AF_INET6: return __xfrm6_selector_match(sel, fl); } return false; } static const struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family) { const struct xfrm_policy_afinfo *afinfo; if (unlikely(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return NULL; rcu_read_lock(); afinfo = rcu_dereference(xfrm_policy_afinfo[family]); if (unlikely(!afinfo)) rcu_read_unlock(); return afinfo; } /* Called with rcu_read_lock(). */ static const struct xfrm_if_cb *xfrm_if_get_cb(void) { return rcu_dereference(xfrm_if_cb); } struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, int family, u32 mark) { const struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); rcu_read_unlock(); return dst; } EXPORT_SYMBOL(__xfrm_dst_lookup); static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; xfrm_address_t *daddr = &x->id.daddr; struct dst_entry *dst; if (x->type->flags & XFRM_TYPE_LOCAL_COADDR) { saddr = x->coaddr; daddr = prev_daddr; } if (x->type->flags & XFRM_TYPE_REMOTE_COADDR) { saddr = prev_saddr; daddr = x->coaddr; } dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) memcpy(prev_saddr, saddr, sizeof(*prev_saddr)); if (prev_daddr != daddr) memcpy(prev_daddr, daddr, sizeof(*prev_daddr)); } return dst; } static inline unsigned long make_jiffies(long secs) { if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ) return MAX_SCHEDULE_TIMEOUT-1; else return secs*HZ; } static void xfrm_policy_timer(struct timer_list *t) { struct xfrm_policy *xp = from_timer(xp, t, timer); time64_t now = ktime_get_real_seconds(); time64_t next = TIME64_MAX; int warn = 0; int dir; read_lock(&xp->lock); if (unlikely(xp->walk.dead)) goto out; dir = xfrm_policy_id2dir(xp->index); if (xp->lft.hard_add_expires_seconds) { time64_t tmo = xp->lft.hard_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.hard_use_expires_seconds) { time64_t tmo = xp->lft.hard_use_expires_seconds + (xp->curlft.use_time ? : xp->curlft.add_time) - now; if (tmo <= 0) goto expired; if (tmo < next) next = tmo; } if (xp->lft.soft_add_expires_seconds) { time64_t tmo = xp->lft.soft_add_expires_seconds + xp->curlft.add_time - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (xp->lft.soft_use_expires_seconds) { time64_t tmo = xp->lft.soft_use_expires_seconds + (xp->curlft.use_time ? : xp->curlft.add_time) - now; if (tmo <= 0) { warn = 1; tmo = XFRM_KM_TIMEOUT; } if (tmo < next) next = tmo; } if (warn) km_policy_expired(xp, dir, 0, 0); if (next != TIME64_MAX && !mod_timer(&xp->timer, jiffies + make_jiffies(next))) xfrm_pol_hold(xp); out: read_unlock(&xp->lock); xfrm_pol_put(xp); return; expired: read_unlock(&xp->lock); if (!xfrm_policy_delete(xp, dir)) km_policy_expired(xp, dir, 1, 0); xfrm_pol_put(xp); } /* Allocate xfrm_policy. Not used here, it is supposed to be used by pfkeyv2 * SPD calls. */ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp) { struct xfrm_policy *policy; policy = kzalloc(sizeof(struct xfrm_policy), gfp); if (policy) { write_pnet(&policy->xp_net, net); INIT_LIST_HEAD(&policy->walk.all); INIT_HLIST_NODE(&policy->bydst_inexact_list); INIT_HLIST_NODE(&policy->bydst); INIT_HLIST_NODE(&policy->byidx); rwlock_init(&policy->lock); refcount_set(&policy->refcnt, 1); skb_queue_head_init(&policy->polq.hold_queue); timer_setup(&policy->timer, xfrm_policy_timer, 0); timer_setup(&policy->polq.hold_timer, xfrm_policy_queue_process, 0); } return policy; } EXPORT_SYMBOL(xfrm_policy_alloc); static void xfrm_policy_destroy_rcu(struct rcu_head *head) { struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu); security_xfrm_policy_free(policy->security); kfree(policy); } /* Destroy xfrm_policy: descendant resources must be released to this moment. */ void xfrm_policy_destroy(struct xfrm_policy *policy) { BUG_ON(!policy->walk.dead); if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer)) BUG(); call_rcu(&policy->rcu, xfrm_policy_destroy_rcu); } EXPORT_SYMBOL(xfrm_policy_destroy); /* Rule must be locked. Release descendant resources, announce * entry dead. The rule must be unlinked from lists to the moment. */ static void xfrm_policy_kill(struct xfrm_policy *policy) { write_lock_bh(&policy->lock); policy->walk.dead = 1; write_unlock_bh(&policy->lock); atomic_inc(&policy->genid); if (del_timer(&policy->polq.hold_timer)) xfrm_pol_put(policy); skb_queue_purge(&policy->polq.hold_queue); if (del_timer(&policy->timer)) xfrm_pol_put(policy); xfrm_pol_put(policy); } static unsigned int xfrm_policy_hashmax __read_mostly = 1 * 1024 * 1024; static inline unsigned int idx_hash(struct net *net, u32 index) { return __idx_hash(index, net->xfrm.policy_idx_hmask); } /* calculate policy hash thresholds */ static void __get_hash_thresh(struct net *net, unsigned short family, int dir, u8 *dbits, u8 *sbits) { switch (family) { case AF_INET: *dbits = net->xfrm.policy_bydst[dir].dbits4; *sbits = net->xfrm.policy_bydst[dir].sbits4; break; case AF_INET6: *dbits = net->xfrm.policy_bydst[dir].dbits6; *sbits = net->xfrm.policy_bydst[dir].sbits6; break; default: *dbits = 0; *sbits = 0; } } static struct hlist_head *policy_hash_bysel(struct net *net, const struct xfrm_selector *sel, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __sel_hash(sel, family, hmask, dbits, sbits); if (hash == hmask + 1) return NULL; return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static struct hlist_head *policy_hash_direct(struct net *net, const xfrm_address_t *daddr, const xfrm_address_t *saddr, unsigned short family, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int hash; u8 dbits; u8 sbits; __get_hash_thresh(net, family, dir, &dbits, &sbits); hash = __addr_hash(daddr, saddr, family, hmask, dbits, sbits); return rcu_dereference_check(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)) + hash; } static void xfrm_dst_hash_transfer(struct net *net, struct hlist_head *list, struct hlist_head *ndsttable, unsigned int nhashmask, int dir) { struct hlist_node *tmp, *entry0 = NULL; struct xfrm_policy *pol; unsigned int h0 = 0; u8 dbits; u8 sbits; redo: hlist_for_each_entry_safe(pol, tmp, list, bydst) { unsigned int h; __get_hash_thresh(net, pol->family, dir, &dbits, &sbits); h = __addr_hash(&pol->selector.daddr, &pol->selector.saddr, pol->family, nhashmask, dbits, sbits); if (!entry0) { hlist_del_rcu(&pol->bydst); hlist_add_head_rcu(&pol->bydst, ndsttable + h); h0 = h; } else { if (h != h0) continue; hlist_del_rcu(&pol->bydst); hlist_add_behind_rcu(&pol->bydst, entry0); } entry0 = &pol->bydst; } if (!hlist_empty(list)) { entry0 = NULL; goto redo; } } static void xfrm_idx_hash_transfer(struct hlist_head *list, struct hlist_head *nidxtable, unsigned int nhashmask) { struct hlist_node *tmp; struct xfrm_policy *pol; hlist_for_each_entry_safe(pol, tmp, list, byidx) { unsigned int h; h = __idx_hash(pol->index, nhashmask); hlist_add_head(&pol->byidx, nidxtable+h); } } static unsigned long xfrm_new_hash_mask(unsigned int old_hmask) { return ((old_hmask + 1) << 1) - 1; } static void xfrm_bydst_resize(struct net *net, int dir) { unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *ndst = xfrm_hash_alloc(nsize); struct hlist_head *odst; int i; if (!ndst) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); odst = rcu_dereference_protected(net->xfrm.policy_bydst[dir].table, lockdep_is_held(&net->xfrm.xfrm_policy_lock)); for (i = hmask; i >= 0; i--) xfrm_dst_hash_transfer(net, odst + i, ndst, nhashmask, dir); rcu_assign_pointer(net->xfrm.policy_bydst[dir].table, ndst); net->xfrm.policy_bydst[dir].hmask = nhashmask; write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); synchronize_rcu(); xfrm_hash_free(odst, (hmask + 1) * sizeof(struct hlist_head)); } static void xfrm_byidx_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; unsigned int nhashmask = xfrm_new_hash_mask(hmask); unsigned int nsize = (nhashmask + 1) * sizeof(struct hlist_head); struct hlist_head *oidx = net->xfrm.policy_byidx; struct hlist_head *nidx = xfrm_hash_alloc(nsize); int i; if (!nidx) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); for (i = hmask; i >= 0; i--) xfrm_idx_hash_transfer(oidx + i, nidx, nhashmask); net->xfrm.policy_byidx = nidx; net->xfrm.policy_idx_hmask = nhashmask; spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_hash_free(oidx, (hmask + 1) * sizeof(struct hlist_head)); } static inline int xfrm_bydst_should_resize(struct net *net, int dir, int *total) { unsigned int cnt = net->xfrm.policy_count[dir]; unsigned int hmask = net->xfrm.policy_bydst[dir].hmask; if (total) *total += cnt; if ((hmask + 1) < xfrm_policy_hashmax && cnt > hmask) return 1; return 0; } static inline int xfrm_byidx_should_resize(struct net *net, int total) { unsigned int hmask = net->xfrm.policy_idx_hmask; if ((hmask + 1) < xfrm_policy_hashmax && total > hmask) return 1; return 0; } void xfrm_spd_getinfo(struct net *net, struct xfrmk_spdinfo *si) { si->incnt = net->xfrm.policy_count[XFRM_POLICY_IN]; si->outcnt = net->xfrm.policy_count[XFRM_POLICY_OUT]; si->fwdcnt = net->xfrm.policy_count[XFRM_POLICY_FWD]; si->inscnt = net->xfrm.policy_count[XFRM_POLICY_IN+XFRM_POLICY_MAX]; si->outscnt = net->xfrm.policy_count[XFRM_POLICY_OUT+XFRM_POLICY_MAX]; si->fwdscnt = net->xfrm.policy_count[XFRM_POLICY_FWD+XFRM_POLICY_MAX]; si->spdhcnt = net->xfrm.policy_idx_hmask; si->spdhmcnt = xfrm_policy_hashmax; } EXPORT_SYMBOL(xfrm_spd_getinfo); static DEFINE_MUTEX(hash_resize_mutex); static void xfrm_hash_resize(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hash_work); int dir, total; mutex_lock(&hash_resize_mutex); total = 0; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { if (xfrm_bydst_should_resize(net, dir, &total)) xfrm_bydst_resize(net, dir); } if (xfrm_byidx_should_resize(net, total)) xfrm_byidx_resize(net, total); mutex_unlock(&hash_resize_mutex); } /* Make sure *pol can be inserted into fastbin. * Useful to check that later insert requests will be sucessful * (provided xfrm_policy_lock is held throughout). */ static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_alloc_bin(const struct xfrm_policy *pol, u8 dir) { struct xfrm_pol_inexact_bin *bin, *prev; struct xfrm_pol_inexact_key k = { .family = pol->family, .type = pol->type, .dir = dir, .if_id = pol->if_id, }; struct net *net = xp_net(pol); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); write_pnet(&k.net, net); bin = rhashtable_lookup_fast(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); if (bin) return bin; bin = kzalloc(sizeof(*bin), GFP_ATOMIC); if (!bin) return NULL; bin->k = k; INIT_HLIST_HEAD(&bin->hhead); bin->root_d = RB_ROOT; bin->root_s = RB_ROOT; seqcount_spinlock_init(&bin->count, &net->xfrm.xfrm_policy_lock); prev = rhashtable_lookup_get_insert_key(&xfrm_policy_inexact_table, &bin->k, &bin->head, xfrm_pol_inexact_params); if (!prev) { list_add(&bin->inexact_bins, &net->xfrm.inexact_bins); return bin; } kfree(bin); return IS_ERR(prev) ? NULL : prev; } static bool xfrm_pol_inexact_addr_use_any_list(const xfrm_address_t *addr, int family, u8 prefixlen) { if (xfrm_addr_any(addr, family)) return true; if (family == AF_INET6 && prefixlen < INEXACT_PREFIXLEN_IPV6) return true; if (family == AF_INET && prefixlen < INEXACT_PREFIXLEN_IPV4) return true; return false; } static bool xfrm_policy_inexact_insert_use_any_list(const struct xfrm_policy *policy) { const xfrm_address_t *addr; bool saddr_any, daddr_any; u8 prefixlen; addr = &policy->selector.saddr; prefixlen = policy->selector.prefixlen_s; saddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); addr = &policy->selector.daddr; prefixlen = policy->selector.prefixlen_d; daddr_any = xfrm_pol_inexact_addr_use_any_list(addr, policy->family, prefixlen); return saddr_any && daddr_any; } static void xfrm_pol_inexact_node_init(struct xfrm_pol_inexact_node *node, const xfrm_address_t *addr, u8 prefixlen) { node->addr = *addr; node->prefixlen = prefixlen; } static struct xfrm_pol_inexact_node * xfrm_pol_inexact_node_alloc(const xfrm_address_t *addr, u8 prefixlen) { struct xfrm_pol_inexact_node *node; node = kzalloc(sizeof(*node), GFP_ATOMIC); if (node) xfrm_pol_inexact_node_init(node, addr, prefixlen); return node; } static int xfrm_policy_addr_delta(const xfrm_address_t *a, const xfrm_address_t *b, u8 prefixlen, u16 family) { u32 ma, mb, mask; unsigned int pdw, pbi; int delta = 0; switch (family) { case AF_INET: if (prefixlen == 0) return 0; mask = ~0U << (32 - prefixlen); ma = ntohl(a->a4) & mask; mb = ntohl(b->a4) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; break; case AF_INET6: pdw = prefixlen >> 5; pbi = prefixlen & 0x1f; if (pdw) { delta = memcmp(a->a6, b->a6, pdw << 2); if (delta) return delta; } if (pbi) { mask = ~0U << (32 - pbi); ma = ntohl(a->a6[pdw]) & mask; mb = ntohl(b->a6[pdw]) & mask; if (ma < mb) delta = -1; else if (ma > mb) delta = 1; } break; default: break; } return delta; } static void xfrm_policy_inexact_list_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, u16 family) { unsigned int matched_s, matched_d; struct xfrm_policy *policy, *p; matched_s = 0; matched_d = 0; list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { struct hlist_node *newpos = NULL; bool matches_s, matches_d; if (!policy->bydst_reinsert) continue; WARN_ON_ONCE(policy->family != family); policy->bydst_reinsert = false; hlist_for_each_entry(p, &n->hhead, bydst) { if (policy->priority > p->priority) newpos = &p->bydst; else if (policy->priority == p->priority && policy->pos > p->pos) newpos = &p->bydst; else break; } if (newpos) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, &n->hhead); /* paranoia checks follow. * Check that the reinserted policy matches at least * saddr or daddr for current node prefix. * * Matching both is fine, matching saddr in one policy * (but not daddr) and then matching only daddr in another * is a bug. */ matches_s = xfrm_policy_addr_delta(&policy->selector.saddr, &n->addr, n->prefixlen, family) == 0; matches_d = xfrm_policy_addr_delta(&policy->selector.daddr, &n->addr, n->prefixlen, family) == 0; if (matches_s && matches_d) continue; WARN_ON_ONCE(!matches_s && !matches_d); if (matches_s) matched_s++; if (matches_d) matched_d++; WARN_ON_ONCE(matched_s && matched_d); } } static void xfrm_policy_inexact_node_reinsert(struct net *net, struct xfrm_pol_inexact_node *n, struct rb_root *new, u16 family) { struct xfrm_pol_inexact_node *node; struct rb_node **p, *parent; /* we should not have another subtree here */ WARN_ON_ONCE(!RB_EMPTY_ROOT(&n->root)); restart: parent = NULL; p = &new->rb_node; while (*p) { u8 prefixlen; int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); prefixlen = min(node->prefixlen, n->prefixlen); delta = xfrm_policy_addr_delta(&n->addr, &node->addr, prefixlen, family); if (delta < 0) { p = &parent->rb_left; } else if (delta > 0) { p = &parent->rb_right; } else { bool same_prefixlen = node->prefixlen == n->prefixlen; struct xfrm_policy *tmp; hlist_for_each_entry(tmp, &n->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } node->prefixlen = prefixlen; xfrm_policy_inexact_list_reinsert(net, node, family); if (same_prefixlen) { kfree_rcu(n, rcu); return; } rb_erase(*p, new); kfree_rcu(n, rcu); n = node; goto restart; } } rb_link_node_rcu(&n->node, parent, p); rb_insert_color(&n->node, new); } /* merge nodes v and n */ static void xfrm_policy_inexact_node_merge(struct net *net, struct xfrm_pol_inexact_node *v, struct xfrm_pol_inexact_node *n, u16 family) { struct xfrm_pol_inexact_node *node; struct xfrm_policy *tmp; struct rb_node *rnode; /* To-be-merged node v has a subtree. * * Dismantle it and insert its nodes to n->root. */ while ((rnode = rb_first(&v->root)) != NULL) { node = rb_entry(rnode, struct xfrm_pol_inexact_node, node); rb_erase(&node->node, &v->root); xfrm_policy_inexact_node_reinsert(net, node, &n->root, family); } hlist_for_each_entry(tmp, &v->hhead, bydst) { tmp->bydst_reinsert = true; hlist_del_rcu(&tmp->bydst); } xfrm_policy_inexact_list_reinsert(net, n, family); } static struct xfrm_pol_inexact_node * xfrm_policy_inexact_insert_node(struct net *net, struct rb_root *root, xfrm_address_t *addr, u16 family, u8 prefixlen, u8 dir) { struct xfrm_pol_inexact_node *cached = NULL; struct rb_node **p, *parent = NULL; struct xfrm_pol_inexact_node *node; p = &root->rb_node; while (*p) { int delta; parent = *p; node = rb_entry(*p, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta == 0 && prefixlen >= node->prefixlen) { WARN_ON_ONCE(cached); /* ipsec policies got lost */ return node; } if (delta < 0) p = &parent->rb_left; else p = &parent->rb_right; if (prefixlen < node->prefixlen) { delta = xfrm_policy_addr_delta(addr, &node->addr, prefixlen, family); if (delta) continue; /* This node is a subnet of the new prefix. It needs * to be removed and re-inserted with the smaller * prefix and all nodes that are now also covered * by the reduced prefixlen. */ rb_erase(&node->node, root); if (!cached) { xfrm_pol_inexact_node_init(node, addr, prefixlen); cached = node; } else { /* This node also falls within the new * prefixlen. Merge the to-be-reinserted * node and this one. */ xfrm_policy_inexact_node_merge(net, node, cached, family); kfree_rcu(node, rcu); } /* restart */ p = &root->rb_node; parent = NULL; } } node = cached; if (!node) { node = xfrm_pol_inexact_node_alloc(addr, prefixlen); if (!node) return NULL; } rb_link_node_rcu(&node->node, parent, p); rb_insert_color(&node->node, root); return node; } static void xfrm_policy_inexact_gc_tree(struct rb_root *r, bool rm) { struct xfrm_pol_inexact_node *node; struct rb_node *rn = rb_first(r); while (rn) { node = rb_entry(rn, struct xfrm_pol_inexact_node, node); xfrm_policy_inexact_gc_tree(&node->root, rm); rn = rb_next(rn); if (!hlist_empty(&node->hhead) || !RB_EMPTY_ROOT(&node->root)) { WARN_ON_ONCE(rm); continue; } rb_erase(&node->node, r); kfree_rcu(node, rcu); } } static void __xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b, bool net_exit) { write_seqcount_begin(&b->count); xfrm_policy_inexact_gc_tree(&b->root_d, net_exit); xfrm_policy_inexact_gc_tree(&b->root_s, net_exit); write_seqcount_end(&b->count); if (!RB_EMPTY_ROOT(&b->root_d) || !RB_EMPTY_ROOT(&b->root_s) || !hlist_empty(&b->hhead)) { WARN_ON_ONCE(net_exit); return; } if (rhashtable_remove_fast(&xfrm_policy_inexact_table, &b->head, xfrm_pol_inexact_params) == 0) { list_del(&b->inexact_bins); kfree_rcu(b, rcu); } } static void xfrm_policy_inexact_prune_bin(struct xfrm_pol_inexact_bin *b) { struct net *net = read_pnet(&b->k.net); spin_lock_bh(&net->xfrm.xfrm_policy_lock); __xfrm_policy_inexact_prune_bin(b, false); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static void __xfrm_policy_inexact_flush(struct net *net) { struct xfrm_pol_inexact_bin *bin, *t; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(bin, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(bin, false); } static struct hlist_head * xfrm_policy_inexact_alloc_chain(struct xfrm_pol_inexact_bin *bin, struct xfrm_policy *policy, u8 dir) { struct xfrm_pol_inexact_node *n; struct net *net; net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); if (xfrm_policy_inexact_insert_use_any_list(policy)) return &bin->hhead; if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.daddr, policy->family, policy->selector.prefixlen_d)) { write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_s, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } /* daddr is fixed */ write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &bin->root_d, &policy->selector.daddr, policy->family, policy->selector.prefixlen_d, dir); write_seqcount_end(&bin->count); if (!n) return NULL; /* saddr is wildcard */ if (xfrm_pol_inexact_addr_use_any_list(&policy->selector.saddr, policy->family, policy->selector.prefixlen_s)) return &n->hhead; write_seqcount_begin(&bin->count); n = xfrm_policy_inexact_insert_node(net, &n->root, &policy->selector.saddr, policy->family, policy->selector.prefixlen_s, dir); write_seqcount_end(&bin->count); if (!n) return NULL; return &n->hhead; } static struct xfrm_policy * xfrm_policy_inexact_insert(struct xfrm_policy *policy, u8 dir, int excl) { struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *delpol; struct hlist_head *chain; struct net *net; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) return ERR_PTR(-ENOMEM); net = xp_net(policy); lockdep_assert_held(&net->xfrm.xfrm_policy_lock); chain = xfrm_policy_inexact_alloc_chain(bin, policy, dir); if (!chain) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-ENOMEM); } delpol = xfrm_policy_insert_list(chain, policy, excl); if (delpol && excl) { __xfrm_policy_inexact_prune_bin(bin, false); return ERR_PTR(-EEXIST); } chain = &net->xfrm.policy_inexact[dir]; xfrm_policy_insert_inexact_list(chain, policy); if (delpol) __xfrm_policy_inexact_prune_bin(bin, false); return delpol; } static void xfrm_hash_rebuild(struct work_struct *work) { struct net *net = container_of(work, struct net, xfrm.policy_hthresh.work); unsigned int hmask; struct xfrm_policy *pol; struct xfrm_policy *policy; struct hlist_head *chain; struct hlist_head *odst; struct hlist_node *newpos; int i; int dir; unsigned seq; u8 lbits4, rbits4, lbits6, rbits6; mutex_lock(&hash_resize_mutex); /* read selector prefixlen thresholds */ do { seq = read_seqbegin(&net->xfrm.policy_hthresh.lock); lbits4 = net->xfrm.policy_hthresh.lbits4; rbits4 = net->xfrm.policy_hthresh.rbits4; lbits6 = net->xfrm.policy_hthresh.lbits6; rbits6 = net->xfrm.policy_hthresh.rbits6; } while (read_seqretry(&net->xfrm.policy_hthresh.lock, seq)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); write_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); /* make sure that we can insert the indirect policies again before * we start with destructive action. */ list_for_each_entry(policy, &net->xfrm.policy_all, walk.all) { struct xfrm_pol_inexact_bin *bin; u8 dbits, sbits; dir = xfrm_policy_id2dir(policy->index); if (policy->walk.dead || dir >= XFRM_POLICY_MAX) continue; if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { if (policy->family == AF_INET) { dbits = rbits4; sbits = lbits4; } else { dbits = rbits6; sbits = lbits6; } } else { if (policy->family == AF_INET) { dbits = lbits4; sbits = rbits4; } else { dbits = lbits6; sbits = rbits6; } } if (policy->selector.prefixlen_d < dbits || policy->selector.prefixlen_s < sbits) continue; bin = xfrm_policy_inexact_alloc_bin(policy, dir); if (!bin) goto out_unlock; if (!xfrm_policy_inexact_alloc_chain(bin, policy, dir)) goto out_unlock; } /* reset the bydst and inexact table in all directions */ for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct hlist_node *n; hlist_for_each_entry_safe(policy, n, &net->xfrm.policy_inexact[dir], bydst_inexact_list) { hlist_del_rcu(&policy->bydst); hlist_del_init(&policy->bydst_inexact_list); } hmask = net->xfrm.policy_bydst[dir].hmask; odst = net->xfrm.policy_bydst[dir].table; for (i = hmask; i >= 0; i--) { hlist_for_each_entry_safe(policy, n, odst + i, bydst) hlist_del_rcu(&policy->bydst); } if ((dir & XFRM_POLICY_MASK) == XFRM_POLICY_OUT) { /* dir out => dst = remote, src = local */ net->xfrm.policy_bydst[dir].dbits4 = rbits4; net->xfrm.policy_bydst[dir].sbits4 = lbits4; net->xfrm.policy_bydst[dir].dbits6 = rbits6; net->xfrm.policy_bydst[dir].sbits6 = lbits6; } else { /* dir in/fwd => dst = local, src = remote */ net->xfrm.policy_bydst[dir].dbits4 = lbits4; net->xfrm.policy_bydst[dir].sbits4 = rbits4; net->xfrm.policy_bydst[dir].dbits6 = lbits6; net->xfrm.policy_bydst[dir].sbits6 = rbits6; } } /* re-insert all policies by order of creation */ list_for_each_entry_reverse(policy, &net->xfrm.policy_all, walk.all) { if (policy->walk.dead) continue; dir = xfrm_policy_id2dir(policy->index); if (dir >= XFRM_POLICY_MAX) { /* skip socket policies */ continue; } newpos = NULL; chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (!chain) { void *p = xfrm_policy_inexact_insert(policy, dir, 0); WARN_ONCE(IS_ERR(p), "reinsert: %ld\n", PTR_ERR(p)); continue; } hlist_for_each_entry(pol, chain, bydst) { if (policy->priority >= pol->priority) newpos = &pol->bydst; else break; } if (newpos) hlist_add_behind_rcu(&policy->bydst, newpos); else hlist_add_head_rcu(&policy->bydst, chain); } out_unlock: __xfrm_policy_inexact_flush(net); write_seqcount_end(&net->xfrm.xfrm_policy_hash_generation); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); mutex_unlock(&hash_resize_mutex); } void xfrm_policy_hash_rebuild(struct net *net) { schedule_work(&net->xfrm.policy_hthresh.work); } EXPORT_SYMBOL(xfrm_policy_hash_rebuild); /* Generate new index... KAME seems to generate them ordered by cost * of an absolute inpredictability of ordering of rules. This will not pass. */ static u32 xfrm_gen_index(struct net *net, int dir, u32 index) { static u32 idx_generator; for (;;) { struct hlist_head *list; struct xfrm_policy *p; u32 idx; int found; if (!index) { idx = (idx_generator | dir); idx_generator += 8; } else { idx = index; index = 0; } if (idx == 0) idx = 8; list = net->xfrm.policy_byidx + idx_hash(net, idx); found = 0; hlist_for_each_entry(p, list, byidx) { if (p->index == idx) { found = 1; break; } } if (!found) return idx; } } static inline int selector_cmp(struct xfrm_selector *s1, struct xfrm_selector *s2) { u32 *p1 = (u32 *) s1; u32 *p2 = (u32 *) s2; int len = sizeof(struct xfrm_selector) / sizeof(u32); int i; for (i = 0; i < len; i++) { if (p1[i] != p2[i]) return 1; } return 0; } static void xfrm_policy_requeue(struct xfrm_policy *old, struct xfrm_policy *new) { struct xfrm_policy_queue *pq = &old->polq; struct sk_buff_head list; if (skb_queue_empty(&pq->hold_queue)) return; __skb_queue_head_init(&list); spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice_init(&pq->hold_queue, &list); if (del_timer(&pq->hold_timer)) xfrm_pol_put(old); spin_unlock_bh(&pq->hold_queue.lock); pq = &new->polq; spin_lock_bh(&pq->hold_queue.lock); skb_queue_splice(&list, &pq->hold_queue); pq->timeout = XFRM_QUEUE_TMO_MIN; if (!mod_timer(&pq->hold_timer, jiffies)) xfrm_pol_hold(new); spin_unlock_bh(&pq->hold_queue.lock); } static inline bool xfrm_policy_mark_match(const struct xfrm_mark *mark, struct xfrm_policy *pol) { return mark->v == pol->mark.v && mark->m == pol->mark.m; } static u32 xfrm_pol_bin_key(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_key *k = data; u32 a = k->type << 24 | k->dir << 16 | k->family; return jhash_3words(a, k->if_id, net_hash_mix(read_pnet(&k->net)), seed); } static u32 xfrm_pol_bin_obj(const void *data, u32 len, u32 seed) { const struct xfrm_pol_inexact_bin *b = data; return xfrm_pol_bin_key(&b->k, 0, seed); } static int xfrm_pol_bin_cmp(struct rhashtable_compare_arg *arg, const void *ptr) { const struct xfrm_pol_inexact_key *key = arg->key; const struct xfrm_pol_inexact_bin *b = ptr; int ret; if (!net_eq(read_pnet(&b->k.net), read_pnet(&key->net))) return -1; ret = b->k.dir ^ key->dir; if (ret) return ret; ret = b->k.type ^ key->type; if (ret) return ret; ret = b->k.family ^ key->family; if (ret) return ret; return b->k.if_id ^ key->if_id; } static const struct rhashtable_params xfrm_pol_inexact_params = { .head_offset = offsetof(struct xfrm_pol_inexact_bin, head), .hashfn = xfrm_pol_bin_key, .obj_hashfn = xfrm_pol_bin_obj, .obj_cmpfn = xfrm_pol_bin_cmp, .automatic_shrinking = true, }; static void xfrm_policy_insert_inexact_list(struct hlist_head *chain, struct xfrm_policy *policy) { struct xfrm_policy *pol, *delpol = NULL; struct hlist_node *newpos = NULL; int i = 0; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = &pol->bydst_inexact_list; continue; } if (delpol) break; } if (newpos) hlist_add_behind_rcu(&policy->bydst_inexact_list, newpos); else hlist_add_head_rcu(&policy->bydst_inexact_list, chain); hlist_for_each_entry(pol, chain, bydst_inexact_list) { pol->pos = i; i++; } } static struct xfrm_policy *xfrm_policy_insert_list(struct hlist_head *chain, struct xfrm_policy *policy, bool excl) { struct xfrm_policy *pol, *newpos = NULL, *delpol = NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == policy->type && pol->if_id == policy->if_id && !selector_cmp(&pol->selector, &policy->selector) && xfrm_policy_mark_match(&policy->mark, pol) && xfrm_sec_ctx_match(pol->security, policy->security) && !WARN_ON(delpol)) { if (excl) return ERR_PTR(-EEXIST); delpol = pol; if (policy->priority > pol->priority) continue; } else if (policy->priority >= pol->priority) { newpos = pol; continue; } if (delpol) break; } if (newpos) hlist_add_behind_rcu(&policy->bydst, &newpos->bydst); else hlist_add_head_rcu(&policy->bydst, chain); return delpol; } int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) { struct net *net = xp_net(policy); struct xfrm_policy *delpol; struct hlist_head *chain; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) delpol = xfrm_policy_insert_list(chain, policy, excl); else delpol = xfrm_policy_inexact_insert(policy, dir, excl); if (IS_ERR(delpol)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return PTR_ERR(delpol); } __xfrm_policy_link(policy, dir); /* After previous checking, family can either be AF_INET or AF_INET6 */ if (policy->family == AF_INET) rt_genid_bump_ipv4(net); else rt_genid_bump_ipv6(net); if (delpol) { xfrm_policy_requeue(delpol, policy); __xfrm_policy_unlink(delpol, dir); } policy->index = delpol ? delpol->index : xfrm_gen_index(net, dir, policy->index); hlist_add_head(&policy->byidx, net->xfrm.policy_byidx+idx_hash(net, policy->index)); policy->curlft.add_time = ktime_get_real_seconds(); policy->curlft.use_time = 0; if (!mod_timer(&policy->timer, jiffies + HZ)) xfrm_pol_hold(policy); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (delpol) xfrm_policy_kill(delpol); else if (xfrm_bydst_should_resize(net, dir, NULL)) schedule_work(&net->xfrm.policy_hash_work); return 0; } EXPORT_SYMBOL(xfrm_policy_insert); static struct xfrm_policy * __xfrm_policy_bysel_ctx(struct hlist_head *chain, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx) { struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry(pol, chain, bydst) { if (pol->type == type && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol) && !selector_cmp(sel, &pol->selector) && xfrm_sec_ctx_match(ctx, pol->security)) return pol; } return NULL; } struct xfrm_policy * xfrm_policy_bysel_ctx(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, struct xfrm_selector *sel, struct xfrm_sec_ctx *ctx, int delete, int *err) { struct xfrm_pol_inexact_bin *bin = NULL; struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, sel, sel->family, dir); if (!chain) { struct xfrm_pol_inexact_candidates cand; int i; bin = xfrm_policy_inexact_lookup(net, type, sel->family, dir, if_id); if (!bin) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } if (!xfrm_policy_find_inexact_candidates(&cand, bin, &sel->saddr, &sel->daddr)) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return NULL; } pol = NULL; for (i = 0; i < ARRAY_SIZE(cand.res); i++) { struct xfrm_policy *tmp; tmp = __xfrm_policy_bysel_ctx(cand.res[i], mark, if_id, type, dir, sel, ctx); if (!tmp) continue; if (!pol || tmp->pos < pol->pos) pol = tmp; } } else { pol = __xfrm_policy_bysel_ctx(chain, mark, if_id, type, dir, sel, ctx); } if (pol) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete(pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); if (bin && delete) xfrm_policy_inexact_prune_bin(bin); return ret; } EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy * xfrm_policy_byid(struct net *net, const struct xfrm_mark *mark, u32 if_id, u8 type, int dir, u32 id, int delete, int *err) { struct xfrm_policy *pol, *ret; struct hlist_head *chain; *err = -ENOENT; if (xfrm_policy_id2dir(id) != dir) return NULL; *err = 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = net->xfrm.policy_byidx + idx_hash(net, id); ret = NULL; hlist_for_each_entry(pol, chain, byidx) { if (pol->type == type && pol->index == id && pol->if_id == if_id && xfrm_policy_mark_match(mark, pol)) { xfrm_pol_hold(pol); if (delete) { *err = security_xfrm_policy_delete( pol->security); if (*err) { spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return pol; } __xfrm_policy_unlink(pol, dir); } ret = pol; break; } } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (ret && delete) xfrm_policy_kill(ret); return ret; } EXPORT_SYMBOL(xfrm_policy_byid); #ifdef CONFIG_SECURITY_NETWORK_XFRM static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { struct xfrm_policy *pol; int err = 0; list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { if (pol->walk.dead || xfrm_policy_id2dir(pol->index) >= XFRM_POLICY_MAX || pol->type != type) continue; err = security_xfrm_policy_delete(pol->security); if (err) { xfrm_audit_policy_delete(pol, 0, task_valid); return err; } } return err; } #else static inline int xfrm_policy_flush_secctx_check(struct net *net, u8 type, bool task_valid) { return 0; } #endif int xfrm_policy_flush(struct net *net, u8 type, bool task_valid) { int dir, err = 0, cnt = 0; struct xfrm_policy *pol; spin_lock_bh(&net->xfrm.xfrm_policy_lock); err = xfrm_policy_flush_secctx_check(net, type, task_valid); if (err) goto out; again: list_for_each_entry(pol, &net->xfrm.policy_all, walk.all) { dir = xfrm_policy_id2dir(pol->index); if (pol->walk.dead || dir >= XFRM_POLICY_MAX || pol->type != type) continue; __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); cnt++; xfrm_audit_policy_delete(pol, 1, task_valid); xfrm_policy_kill(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); goto again; } if (cnt) __xfrm_policy_inexact_flush(net); else err = -ESRCH; out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return err; } EXPORT_SYMBOL(xfrm_policy_flush); int xfrm_policy_walk(struct net *net, struct xfrm_policy_walk *walk, int (*func)(struct xfrm_policy *, int, int, void*), void *data) { struct xfrm_policy *pol; struct xfrm_policy_walk_entry *x; int error = 0; if (walk->type >= XFRM_POLICY_TYPE_MAX && walk->type != XFRM_POLICY_TYPE_ANY) return -EINVAL; if (list_empty(&walk->walk.all) && walk->seq != 0) return 0; spin_lock_bh(&net->xfrm.xfrm_policy_lock); if (list_empty(&walk->walk.all)) x = list_first_entry(&net->xfrm.policy_all, struct xfrm_policy_walk_entry, all); else x = list_first_entry(&walk->walk.all, struct xfrm_policy_walk_entry, all); list_for_each_entry_from(x, &net->xfrm.policy_all, all) { if (x->dead) continue; pol = container_of(x, struct xfrm_policy, walk); if (walk->type != XFRM_POLICY_TYPE_ANY && walk->type != pol->type) continue; error = func(pol, xfrm_policy_id2dir(pol->index), walk->seq, data); if (error) { list_move_tail(&walk->walk.all, &x->all); goto out; } walk->seq++; } if (walk->seq == 0) { error = -ENOENT; goto out; } list_del_init(&walk->walk.all); out: spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return error; } EXPORT_SYMBOL(xfrm_policy_walk); void xfrm_policy_walk_init(struct xfrm_policy_walk *walk, u8 type) { INIT_LIST_HEAD(&walk->walk.all); walk->walk.dead = 1; walk->type = type; walk->seq = 0; } EXPORT_SYMBOL(xfrm_policy_walk_init); void xfrm_policy_walk_done(struct xfrm_policy_walk *walk, struct net *net) { if (list_empty(&walk->walk.all)) return; spin_lock_bh(&net->xfrm.xfrm_policy_lock); /*FIXME where is net? */ list_del(&walk->walk.all); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } EXPORT_SYMBOL(xfrm_policy_walk_done); /* * Find policy to apply to this flow. * * Returns 0 if policy found, else an -errno. */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, u8 type, u16 family, int dir, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; bool match; if (pol->family != family || pol->if_id != if_id || (fl->flowi_mark & pol->mark.m) != pol->mark.v || pol->type != type) return ret; match = xfrm_selector_match(sel, fl, family); if (match) ret = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, dir); return ret; } static struct xfrm_pol_inexact_node * xfrm_policy_lookup_inexact_addr(const struct rb_root *r, seqcount_spinlock_t *count, const xfrm_address_t *addr, u16 family) { const struct rb_node *parent; int seq; again: seq = read_seqcount_begin(count); parent = rcu_dereference_raw(r->rb_node); while (parent) { struct xfrm_pol_inexact_node *node; int delta; node = rb_entry(parent, struct xfrm_pol_inexact_node, node); delta = xfrm_policy_addr_delta(addr, &node->addr, node->prefixlen, family); if (delta < 0) { parent = rcu_dereference_raw(parent->rb_left); continue; } else if (delta > 0) { parent = rcu_dereference_raw(parent->rb_right); continue; } return node; } if (read_seqcount_retry(count, seq)) goto again; return NULL; } static bool xfrm_policy_find_inexact_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_pol_inexact_bin *b, const xfrm_address_t *saddr, const xfrm_address_t *daddr) { struct xfrm_pol_inexact_node *n; u16 family; if (!b) return false; family = b->k.family; memset(cand, 0, sizeof(*cand)); cand->res[XFRM_POL_CAND_ANY] = &b->hhead; n = xfrm_policy_lookup_inexact_addr(&b->root_d, &b->count, daddr, family); if (n) { cand->res[XFRM_POL_CAND_DADDR] = &n->hhead; n = xfrm_policy_lookup_inexact_addr(&n->root, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_BOTH] = &n->hhead; } n = xfrm_policy_lookup_inexact_addr(&b->root_s, &b->count, saddr, family); if (n) cand->res[XFRM_POL_CAND_SADDR] = &n->hhead; return true; } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup_rcu(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_key k = { .family = family, .type = type, .dir = dir, .if_id = if_id, }; write_pnet(&k.net, net); return rhashtable_lookup(&xfrm_policy_inexact_table, &k, xfrm_pol_inexact_params); } static struct xfrm_pol_inexact_bin * xfrm_policy_inexact_lookup(struct net *net, u8 type, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_bin *bin; lockdep_assert_held(&net->xfrm.xfrm_policy_lock); rcu_read_lock(); bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); rcu_read_unlock(); return bin; } static struct xfrm_policy * __xfrm_policy_eval_candidates(struct hlist_head *chain, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, int dir, u32 if_id) { u32 priority = prefer ? prefer->priority : ~0u; struct xfrm_policy *pol; if (!chain) return NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { int err; if (pol->priority > priority) break; err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err != -ESRCH) return ERR_PTR(err); continue; } if (prefer) { /* matches. Is it older than *prefer? */ if (pol->priority == priority && prefer->pos < pol->pos) return prefer; } return pol; } return NULL; } static struct xfrm_policy * xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_policy *prefer, const struct flowi *fl, u8 type, u16 family, int dir, u32 if_id) { struct xfrm_policy *tmp; int i; for (i = 0; i < ARRAY_SIZE(cand->res); i++) { tmp = __xfrm_policy_eval_candidates(cand->res[i], prefer, fl, type, family, dir, if_id); if (!tmp) continue; if (IS_ERR(tmp)) return tmp; prefer = tmp; } return prefer; } static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { struct xfrm_pol_inexact_candidates cand; const xfrm_address_t *daddr, *saddr; struct xfrm_pol_inexact_bin *bin; struct xfrm_policy *pol, *ret; struct hlist_head *chain; unsigned int sequence; int err; daddr = xfrm_flowi_daddr(fl, family); saddr = xfrm_flowi_saddr(fl, family); if (unlikely(!daddr || !saddr)) return NULL; rcu_read_lock(); retry: do { sequence = read_seqcount_begin(&net->xfrm.xfrm_policy_hash_generation); chain = policy_hash_direct(net, daddr, saddr, family, dir); } while (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)); ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { err = xfrm_policy_match(pol, fl, type, family, dir, if_id); if (err) { if (err == -ESRCH) continue; else { ret = ERR_PTR(err); goto fail; } } else { ret = pol; break; } } bin = xfrm_policy_inexact_lookup_rcu(net, type, family, dir, if_id); if (!bin || !xfrm_policy_find_inexact_candidates(&cand, bin, saddr, daddr)) goto skip_inexact; pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, family, dir, if_id); if (pol) { ret = pol; if (IS_ERR(pol)) goto fail; } skip_inexact: if (read_seqcount_retry(&net->xfrm.xfrm_policy_hash_generation, sequence)) goto retry; if (ret && !xfrm_pol_hold_rcu(ret)) goto retry; fail: rcu_read_unlock(); return ret; } static struct xfrm_policy *xfrm_policy_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, u32 if_id) { #ifdef CONFIG_XFRM_SUB_POLICY struct xfrm_policy *pol; pol = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_SUB, fl, family, dir, if_id); if (pol != NULL) return pol; #endif return xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, fl, family, dir, if_id); } static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir, const struct flowi *fl, u16 family, u32 if_id) { struct xfrm_policy *pol; rcu_read_lock(); again: pol = rcu_dereference(sk->sk_policy[dir]); if (pol != NULL) { bool match; int err = 0; if (pol->family != family) { pol = NULL; goto out; } match = xfrm_selector_match(&pol->selector, fl, family); if (match) { if ((sk->sk_mark & pol->mark.m) != pol->mark.v || pol->if_id != if_id) { pol = NULL; goto out; } err = security_xfrm_policy_lookup(pol->security, fl->flowi_secid, dir); if (!err) { if (!xfrm_pol_hold_rcu(pol)) goto again; } else if (err == -ESRCH) { pol = NULL; } else { pol = ERR_PTR(err); } } else pol = NULL; } out: rcu_read_unlock(); return pol; } static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); list_add(&pol->walk.all, &net->xfrm.policy_all); net->xfrm.policy_count[dir]++; xfrm_pol_hold(pol); } static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); if (list_empty(&pol->walk.all)) return NULL; /* Socket policies are not hashed. */ if (!hlist_unhashed(&pol->bydst)) { hlist_del_rcu(&pol->bydst); hlist_del_init(&pol->bydst_inexact_list); hlist_del(&pol->byidx); } list_del_init(&pol->walk.all); net->xfrm.policy_count[dir]--; return pol; } static void xfrm_sk_policy_link(struct xfrm_policy *pol, int dir) { __xfrm_policy_link(pol, XFRM_POLICY_MAX + dir); } static void xfrm_sk_policy_unlink(struct xfrm_policy *pol, int dir) { __xfrm_policy_unlink(pol, XFRM_POLICY_MAX + dir); } int xfrm_policy_delete(struct xfrm_policy *pol, int dir) { struct net *net = xp_net(pol); spin_lock_bh(&net->xfrm.xfrm_policy_lock); pol = __xfrm_policy_unlink(pol, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (pol) { xfrm_policy_kill(pol); return 0; } return -ENOENT; } EXPORT_SYMBOL(xfrm_policy_delete); int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol) { struct net *net = sock_net(sk); struct xfrm_policy *old_pol; #ifdef CONFIG_XFRM_SUB_POLICY if (pol && pol->type != XFRM_POLICY_TYPE_MAIN) return -EINVAL; #endif spin_lock_bh(&net->xfrm.xfrm_policy_lock); old_pol = rcu_dereference_protected(sk->sk_policy[dir], lockdep_is_held(&net->xfrm.xfrm_policy_lock)); if (pol) { pol->curlft.add_time = ktime_get_real_seconds(); pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0); xfrm_sk_policy_link(pol, dir); } rcu_assign_pointer(sk->sk_policy[dir], pol); if (old_pol) { if (pol) xfrm_policy_requeue(old_pol, pol); /* Unlinking succeeds always. This is the only function * allowed to delete or replace socket policy. */ xfrm_sk_policy_unlink(old_pol, dir); } spin_unlock_bh(&net->xfrm.xfrm_policy_lock); if (old_pol) { xfrm_policy_kill(old_pol); } return 0; } static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir) { struct xfrm_policy *newp = xfrm_policy_alloc(xp_net(old), GFP_ATOMIC); struct net *net = xp_net(old); if (newp) { newp->selector = old->selector; if (security_xfrm_policy_clone(old->security, &newp->security)) { kfree(newp); return NULL; /* ENOMEM */ } newp->lft = old->lft; newp->curlft = old->curlft; newp->mark = old->mark; newp->if_id = old->if_id; newp->action = old->action; newp->flags = old->flags; newp->xfrm_nr = old->xfrm_nr; newp->index = old->index; newp->type = old->type; newp->family = old->family; memcpy(newp->xfrm_vec, old->xfrm_vec, newp->xfrm_nr*sizeof(struct xfrm_tmpl)); spin_lock_bh(&net->xfrm.xfrm_policy_lock); xfrm_sk_policy_link(newp, dir); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); xfrm_pol_put(newp); } return newp; } int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) { const struct xfrm_policy *p; struct xfrm_policy *np; int i, ret = 0; rcu_read_lock(); for (i = 0; i < 2; i++) { p = rcu_dereference(osk->sk_policy[i]); if (p) { np = clone_policy(p, i); if (unlikely(!np)) { ret = -ENOMEM; break; } rcu_assign_pointer(sk->sk_policy[i], np); } } rcu_read_unlock(); return ret; } static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, xfrm_address_t *remote, unsigned short family, u32 mark) { int err; const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; err = afinfo->get_saddr(net, oif, local, remote, mark); rcu_read_unlock(); return err; } /* Resolve list of templates for the flow, given policy. */ static int xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct net *net = xp_net(policy); int nx; int i, error; xfrm_address_t *daddr = xfrm_flowi_daddr(fl, family); xfrm_address_t *saddr = xfrm_flowi_saddr(fl, family); xfrm_address_t tmp; for (nx = 0, i = 0; i < policy->xfrm_nr; i++) { struct xfrm_state *x; xfrm_address_t *remote = daddr; xfrm_address_t *local = saddr; struct xfrm_tmpl *tmpl = &policy->xfrm_vec[i]; if (tmpl->mode == XFRM_MODE_TUNNEL || tmpl->mode == XFRM_MODE_BEET) { remote = &tmpl->id.daddr; local = &tmpl->saddr; if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, tmpl->encap_family, 0); if (error) goto fail; local = &tmp; } } x = xfrm_state_find(remote, local, fl, tmpl, policy, &error, family, policy->if_id); if (x && x->km.state == XFRM_STATE_VALID) { xfrm[nx++] = x; daddr = remote; saddr = local; continue; } if (x) { error = (x->km.state == XFRM_STATE_ERROR ? -EINVAL : -EAGAIN); xfrm_state_put(x); } else if (error == -ESRCH) { error = -EAGAIN; } if (!tmpl->optional) goto fail; } return nx; fail: for (nx--; nx >= 0; nx--) xfrm_state_put(xfrm[nx]); return error; } static int xfrm_tmpl_resolve(struct xfrm_policy **pols, int npols, const struct flowi *fl, struct xfrm_state **xfrm, unsigned short family) { struct xfrm_state *tp[XFRM_MAX_DEPTH]; struct xfrm_state **tpp = (npols > 1) ? tp : xfrm; int cnx = 0; int error; int ret; int i; for (i = 0; i < npols; i++) { if (cnx + pols[i]->xfrm_nr >= XFRM_MAX_DEPTH) { error = -ENOBUFS; goto fail; } ret = xfrm_tmpl_resolve_one(pols[i], fl, &tpp[cnx], family); if (ret < 0) { error = ret; goto fail; } else cnx += ret; } /* found states are sorted for outbound processing */ if (npols > 1) xfrm_state_sort(xfrm, tpp, cnx, family); return cnx; fail: for (cnx--; cnx >= 0; cnx--) xfrm_state_put(tpp[cnx]); return error; } static int xfrm_get_tos(const struct flowi *fl, int family) { if (family == AF_INET) return IPTOS_RT_MASK & fl->u.ip4.flowi4_tos; return 0; } static inline struct xfrm_dst *xfrm_alloc_dst(struct net *net, int family) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_ops *dst_ops; struct xfrm_dst *xdst; if (!afinfo) return ERR_PTR(-EINVAL); switch (family) { case AF_INET: dst_ops = &net->xfrm.xfrm4_dst_ops; break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: dst_ops = &net->xfrm.xfrm6_dst_ops; break; #endif default: BUG(); } xdst = dst_alloc(dst_ops, NULL, 1, DST_OBSOLETE_NONE, 0); if (likely(xdst)) { struct dst_entry *dst = &xdst->u.dst; memset(dst + 1, 0, sizeof(*xdst) - sizeof(*dst)); } else xdst = ERR_PTR(-ENOBUFS); rcu_read_unlock(); return xdst; } static void xfrm_init_path(struct xfrm_dst *path, struct dst_entry *dst, int nfheader_len) { if (dst->ops->family == AF_INET6) { struct rt6_info *rt = (struct rt6_info *)dst; path->path_cookie = rt6_get_cookie(rt); path->u.rt6.rt6i_nfheader_len = nfheader_len; } } static inline int xfrm_fill_dst(struct xfrm_dst *xdst, struct net_device *dev, const struct flowi *fl) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(xdst->u.dst.ops->family); int err; if (!afinfo) return -EINVAL; err = afinfo->fill_dst(xdst, dev, fl); rcu_read_unlock(); return err; } /* Allocate chain of dst_entry's, attach known xfrm's, calculate * all the metrics... Shortly, bundle a bundle. */ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, struct xfrm_dst **bundle, int nx, const struct flowi *fl, struct dst_entry *dst) { const struct xfrm_state_afinfo *afinfo; const struct xfrm_mode *inner_mode; struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; struct xfrm_dst *xdst_prev = NULL; struct xfrm_dst *xdst0 = NULL; int i = 0; int err; int header_len = 0; int nfheader_len = 0; int trailer_len = 0; int tos; int family = policy->selector.family; xfrm_address_t saddr, daddr; xfrm_flowi_addr_get(fl, &saddr, &daddr, family); tos = xfrm_get_tos(fl, family); dst_hold(dst); for (; i < nx; i++) { struct xfrm_dst *xdst = xfrm_alloc_dst(net, family); struct dst_entry *dst1 = &xdst->u.dst; err = PTR_ERR(xdst); if (IS_ERR(xdst)) { dst_release(dst); goto put_states; } bundle[i] = xdst; if (!xdst_prev) xdst0 = xdst; else /* Ref count is taken during xfrm_alloc_dst() * No need to do dst_clone() on dst1 */ xfrm_dst_set_child(xdst_prev, &xdst->u.dst); if (xfrm[i]->sel.family == AF_UNSPEC) { inner_mode = xfrm_ip2inner_mode(xfrm[i], xfrm_af2proto(family)); if (!inner_mode) { err = -EAFNOSUPPORT; dst_release(dst); goto put_states; } } else inner_mode = &xfrm[i]->inner_mode; xdst->route = dst; dst_copy_metrics(dst1, dst); if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { __u32 mark = 0; if (xfrm[i]->props.smark.v || xfrm[i]->props.smark.m) mark = xfrm_smark_get(fl->flowi_mark, xfrm[i]); family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, &saddr, &daddr, family, mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; } else dst_hold(dst); dst1->xfrm = xfrm[i]; xdst->xfrm_genid = xfrm[i]->genid; dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->lastuse = now; dst1->input = dst_discard; rcu_read_lock(); afinfo = xfrm_state_afinfo_get_rcu(inner_mode->family); if (likely(afinfo)) dst1->output = afinfo->output; else dst1->output = dst_discard_out; rcu_read_unlock(); xdst_prev = xdst; header_len += xfrm[i]->props.header_len; if (xfrm[i]->type->flags & XFRM_TYPE_NON_FRAGMENT) nfheader_len += xfrm[i]->props.header_len; trailer_len += xfrm[i]->props.trailer_len; } xfrm_dst_set_child(xdst_prev, dst); xdst0->path = dst; err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; xfrm_init_path(xdst0, dst, nfheader_len); xfrm_init_pmtu(bundle, nx); for (xdst_prev = xdst0; xdst_prev != (struct xfrm_dst *)dst; xdst_prev = (struct xfrm_dst *) xfrm_dst_child(&xdst_prev->u.dst)) { err = xfrm_fill_dst(xdst_prev, dev, fl); if (err) goto free_dst; xdst_prev->u.dst.header_len = header_len; xdst_prev->u.dst.trailer_len = trailer_len; header_len -= xdst_prev->u.dst.xfrm->props.header_len; trailer_len -= xdst_prev->u.dst.xfrm->props.trailer_len; } return &xdst0->u.dst; put_states: for (; i < nx; i++) xfrm_state_put(xfrm[i]); free_dst: if (xdst0) dst_release_immediate(&xdst0->u.dst); return ERR_PTR(err); } static int xfrm_expand_policies(const struct flowi *fl, u16 family, struct xfrm_policy **pols, int *num_pols, int *num_xfrms) { int i; if (*num_pols == 0 || !pols[0]) { *num_pols = 0; *num_xfrms = 0; return 0; } if (IS_ERR(pols[0])) return PTR_ERR(pols[0]); *num_xfrms = pols[0]->xfrm_nr; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0] && pols[0]->action == XFRM_POLICY_ALLOW && pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(xp_net(pols[0]), XFRM_POLICY_TYPE_MAIN, fl, family, XFRM_POLICY_OUT, pols[0]->if_id); if (pols[1]) { if (IS_ERR(pols[1])) { xfrm_pols_put(pols, *num_pols); return PTR_ERR(pols[1]); } (*num_pols)++; (*num_xfrms) += pols[1]->xfrm_nr; } } #endif for (i = 0; i < *num_pols; i++) { if (pols[i]->action != XFRM_POLICY_ALLOW) { *num_xfrms = -1; break; } } return 0; } static struct xfrm_dst * xfrm_resolve_and_create_bundle(struct xfrm_policy **pols, int num_pols, const struct flowi *fl, u16 family, struct dst_entry *dst_orig) { struct net *net = xp_net(pols[0]); struct xfrm_state *xfrm[XFRM_MAX_DEPTH]; struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct xfrm_dst *xdst; struct dst_entry *dst; int err; /* Try to instantiate a bundle */ err = xfrm_tmpl_resolve(pols, num_pols, fl, xfrm, family); if (err <= 0) { if (err == 0) return NULL; if (err != -EAGAIN) XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); return ERR_PTR(err); } dst = xfrm_bundle_create(pols[0], xfrm, bundle, err, fl, dst_orig); if (IS_ERR(dst)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTBUNDLEGENERROR); return ERR_CAST(dst); } xdst = (struct xfrm_dst *)dst; xdst->num_xfrms = err; xdst->num_pols = num_pols; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); xdst->policy_genid = atomic_read(&pols[0]->genid); return xdst; } static void xfrm_policy_queue_process(struct timer_list *t) { struct sk_buff *skb; struct sock *sk; struct dst_entry *dst; struct xfrm_policy *pol = from_timer(pol, t, polq.hold_timer); struct net *net = xp_net(pol); struct xfrm_policy_queue *pq = &pol->polq; struct flowi fl; struct sk_buff_head list; __u32 skb_mark; spin_lock(&pq->hold_queue.lock); skb = skb_peek(&pq->hold_queue); if (!skb) { spin_unlock(&pq->hold_queue.lock); goto out; } dst = skb_dst(skb); sk = skb->sk; /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, dst->ops->family); skb->mark = skb_mark; spin_unlock(&pq->hold_queue.lock); dst_hold(xfrm_dst_path(dst)); dst = xfrm_lookup(net, xfrm_dst_path(dst), &fl, sk, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) goto purge_queue; if (dst->flags & DST_XFRM_QUEUE) { dst_release(dst); if (pq->timeout >= XFRM_QUEUE_TMO_MAX) goto purge_queue; pq->timeout = pq->timeout << 1; if (!mod_timer(&pq->hold_timer, jiffies + pq->timeout)) xfrm_pol_hold(pol); goto out; } dst_release(dst); __skb_queue_head_init(&list); spin_lock(&pq->hold_queue.lock); pq->timeout = 0; skb_queue_splice_init(&pq->hold_queue, &list); spin_unlock(&pq->hold_queue.lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); /* Fixup the mark to support VTI. */ skb_mark = skb->mark; skb->mark = pol->mark.v; xfrm_decode_session(skb, &fl, skb_dst(skb)->ops->family); skb->mark = skb_mark; dst_hold(xfrm_dst_path(skb_dst(skb))); dst = xfrm_lookup(net, xfrm_dst_path(skb_dst(skb)), &fl, skb->sk, 0); if (IS_ERR(dst)) { kfree_skb(skb); continue; } nf_reset_ct(skb); skb_dst_drop(skb); skb_dst_set(skb, dst); dst_output(net, skb->sk, skb); } out: xfrm_pol_put(pol); return; purge_queue: pq->timeout = 0; skb_queue_purge(&pq->hold_queue); xfrm_pol_put(pol); } static int xdst_queue_output(struct net *net, struct sock *sk, struct sk_buff *skb) { unsigned long sched_next; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *) dst; struct xfrm_policy *pol = xdst->pols[0]; struct xfrm_policy_queue *pq = &pol->polq; if (unlikely(skb_fclone_busy(sk, skb))) { kfree_skb(skb); return 0; } if (pq->hold_queue.qlen > XFRM_MAX_QUEUE_LEN) { kfree_skb(skb); return -EAGAIN; } skb_dst_force(skb); spin_lock_bh(&pq->hold_queue.lock); if (!pq->timeout) pq->timeout = XFRM_QUEUE_TMO_MIN; sched_next = jiffies + pq->timeout; if (del_timer(&pq->hold_timer)) { if (time_before(pq->hold_timer.expires, sched_next)) sched_next = pq->hold_timer.expires; xfrm_pol_put(pol); } __skb_queue_tail(&pq->hold_queue, skb); if (!mod_timer(&pq->hold_timer, sched_next)) xfrm_pol_hold(pol); spin_unlock_bh(&pq->hold_queue.lock); return 0; } static struct xfrm_dst *xfrm_create_dummy_bundle(struct net *net, struct xfrm_flo *xflo, const struct flowi *fl, int num_xfrms, u16 family) { int err; struct net_device *dev; struct dst_entry *dst; struct dst_entry *dst1; struct xfrm_dst *xdst; xdst = xfrm_alloc_dst(net, family); if (IS_ERR(xdst)) return xdst; if (!(xflo->flags & XFRM_LOOKUP_QUEUE) || net->xfrm.sysctl_larval_drop || num_xfrms <= 0) return xdst; dst = xflo->dst_orig; dst1 = &xdst->u.dst; dst_hold(dst); xdst->route = dst; dst_copy_metrics(dst1, dst); dst1->obsolete = DST_OBSOLETE_FORCE_CHK; dst1->flags |= DST_XFRM_QUEUE; dst1->lastuse = jiffies; dst1->input = dst_discard; dst1->output = xdst_queue_output; dst_hold(dst); xfrm_dst_set_child(xdst, dst); xdst->path = dst; xfrm_init_path((struct xfrm_dst *)dst1, dst, 0); err = -ENODEV; dev = dst->dev; if (!dev) goto free_dst; err = xfrm_fill_dst(xdst, dev, fl); if (err) goto free_dst; out: return xdst; free_dst: dst_release(dst1); xdst = ERR_PTR(err); goto out; } static struct xfrm_dst *xfrm_bundle_lookup(struct net *net, const struct flowi *fl, u16 family, u8 dir, struct xfrm_flo *xflo, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int num_pols = 0, num_xfrms = 0, err; struct xfrm_dst *xdst; /* Resolve policies to use if we couldn't get them from * previous cache entry */ num_pols = 1; pols[0] = xfrm_policy_lookup(net, fl, family, dir, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto inc_error; if (num_pols == 0) return NULL; if (num_xfrms <= 0) goto make_dummy_bundle; xdst = xfrm_resolve_and_create_bundle(pols, num_pols, fl, family, xflo->dst_orig); if (IS_ERR(xdst)) { err = PTR_ERR(xdst); if (err == -EREMOTE) { xfrm_pols_put(pols, num_pols); return NULL; } if (err != -EAGAIN) goto error; goto make_dummy_bundle; } else if (xdst == NULL) { num_xfrms = 0; goto make_dummy_bundle; } return xdst; make_dummy_bundle: /* We found policies, but there's no bundles to instantiate: * either because the policy blocks, has no transformations or * we could not build template (no xfrm_states).*/ xdst = xfrm_create_dummy_bundle(net, xflo, fl, num_xfrms, family); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); return ERR_CAST(xdst); } xdst->num_pols = num_pols; xdst->num_xfrms = num_xfrms; memcpy(xdst->pols, pols, sizeof(struct xfrm_policy *) * num_pols); return xdst; inc_error: XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLERROR); error: xfrm_pols_put(pols, num_pols); return ERR_PTR(err); } static struct dst_entry *make_blackhole(struct net *net, u16 family, struct dst_entry *dst_orig) { const struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); struct dst_entry *ret; if (!afinfo) { dst_release(dst_orig); return ERR_PTR(-EINVAL); } else { ret = afinfo->blackhole_route(net, dst_orig); } rcu_read_unlock(); return ret; } /* Finds/creates a bundle for given flow and if_id * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. * * xfrm_lookup uses an if_id of 0 by default, and is provided for * compatibility */ struct dst_entry *xfrm_lookup_with_ifid(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags, u32 if_id) { struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; struct xfrm_dst *xdst; struct dst_entry *dst, *route; u16 family = dst_orig->ops->family; u8 dir = XFRM_POLICY_OUT; int i, err, num_pols, num_xfrms = 0, drop_pols = 0; dst = NULL; xdst = NULL; route = NULL; sk = sk_const_to_full_sk(sk); if (sk && sk->sk_policy[XFRM_POLICY_OUT]) { num_pols = 1; pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, family, if_id); err = xfrm_expand_policies(fl, family, pols, &num_pols, &num_xfrms); if (err < 0) goto dropdst; if (num_pols) { if (num_xfrms <= 0) { drop_pols = num_pols; goto no_transform; } xdst = xfrm_resolve_and_create_bundle( pols, num_pols, fl, family, dst_orig); if (IS_ERR(xdst)) { xfrm_pols_put(pols, num_pols); err = PTR_ERR(xdst); if (err == -EREMOTE) goto nopol; goto dropdst; } else if (xdst == NULL) { num_xfrms = 0; drop_pols = num_pols; goto no_transform; } route = xdst->route; } } if (xdst == NULL) { struct xfrm_flo xflo; xflo.dst_orig = dst_orig; xflo.flags = flags; /* To accelerate a bit... */ if (!if_id && ((dst_orig->flags & DST_NOXFRM) || !net->xfrm.policy_count[XFRM_POLICY_OUT])) goto nopol; xdst = xfrm_bundle_lookup(net, fl, family, dir, &xflo, if_id); if (xdst == NULL) goto nopol; if (IS_ERR(xdst)) { err = PTR_ERR(xdst); goto dropdst; } num_pols = xdst->num_pols; num_xfrms = xdst->num_xfrms; memcpy(pols, xdst->pols, sizeof(struct xfrm_policy *) * num_pols); route = xdst->route; } dst = &xdst->u.dst; if (route == NULL && num_xfrms > 0) { /* The only case when xfrm_bundle_lookup() returns a * bundle with null route, is when the template could * not be resolved. It means policies are there, but * bundle could not be created, since we don't yet * have the xfrm_state's. We need to wait for KM to * negotiate new SA's or bail out with error.*/ if (net->xfrm.sysctl_larval_drop) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); err = -EREMOTE; goto error; } err = -EAGAIN; XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTNOSTATES); goto error; } no_transform: if (num_pols == 0) goto nopol; if ((flags & XFRM_LOOKUP_ICMP) && !(pols[0]->flags & XFRM_POLICY_ICMP)) { err = -ENOENT; goto error; } for (i = 0; i < num_pols; i++) pols[i]->curlft.use_time = ktime_get_real_seconds(); if (num_xfrms < 0) { /* Prohibit the flow */ XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTPOLBLOCK); err = -EPERM; goto error; } else if (num_xfrms > 0) { /* Flow transformed */ dst_release(dst_orig); } else { /* Flow passes untransformed */ dst_release(dst); dst = dst_orig; } ok: xfrm_pols_put(pols, drop_pols); if (dst && dst->xfrm && dst->xfrm->props.mode == XFRM_MODE_TUNNEL) dst->flags |= DST_XFRM_TUNNEL; return dst; nopol: if (!(flags & XFRM_LOOKUP_ICMP)) { dst = dst_orig; goto ok; } err = -ENOENT; error: dst_release(dst); dropdst: if (!(flags & XFRM_LOOKUP_KEEP_DST_REF)) dst_release(dst_orig); xfrm_pols_put(pols, drop_pols); return ERR_PTR(err); } EXPORT_SYMBOL(xfrm_lookup_with_ifid); /* Main function: finds/creates a bundle for given flow. * * At the moment we eat a raw IP route. Mostly to speed up lookups * on interfaces with disabled IPsec. */ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { return xfrm_lookup_with_ifid(net, dst_orig, fl, sk, flags, 0); } EXPORT_SYMBOL(xfrm_lookup); /* Callers of xfrm_lookup_route() must ensure a call to dst_output(). * Otherwise we may send out blackholed packets. */ struct dst_entry *xfrm_lookup_route(struct net *net, struct dst_entry *dst_orig, const struct flowi *fl, const struct sock *sk, int flags) { struct dst_entry *dst = xfrm_lookup(net, dst_orig, fl, sk, flags | XFRM_LOOKUP_QUEUE | XFRM_LOOKUP_KEEP_DST_REF); if (PTR_ERR(dst) == -EREMOTE) return make_blackhole(net, dst_orig->ops->family, dst_orig); if (IS_ERR(dst)) dst_release(dst_orig); return dst; } EXPORT_SYMBOL(xfrm_lookup_route); static inline int xfrm_secpath_reject(int idx, struct sk_buff *skb, const struct flowi *fl) { struct sec_path *sp = skb_sec_path(skb); struct xfrm_state *x; if (!sp || idx < 0 || idx >= sp->len) return 0; x = sp->xvec[idx]; if (!x->type->reject) return 0; return x->type->reject(x, skb, fl); } /* When skb is transformed back to its "native" form, we have to * check policy restrictions. At the moment we make this in maximally * stupid way. Shame on me. :-) Of course, connected sockets must * have policy cached at them. */ static inline int xfrm_state_ok(const struct xfrm_tmpl *tmpl, const struct xfrm_state *x, unsigned short family) { if (xfrm_state_kern(x)) return tmpl->optional && !xfrm_state_addr_cmp(tmpl, x, tmpl->encap_family); return x->id.proto == tmpl->id.proto && (x->id.spi == tmpl->id.spi || !tmpl->id.spi) && (x->props.reqid == tmpl->reqid || !tmpl->reqid) && x->props.mode == tmpl->mode && (tmpl->allalgs || (tmpl->aalgos & (1<<x->props.aalgo)) || !(xfrm_id_proto_match(tmpl->id.proto, IPSEC_PROTO_ANY))) && !(x->props.mode != XFRM_MODE_TRANSPORT && xfrm_state_addr_cmp(tmpl, x, family)); } /* * 0 or more than 0 is returned when validation is succeeded (either bypass * because of optional transport mode, or next index of the mathced secpath * state with the template. * -1 is returned when no matching template is found. * Otherwise "-2 - errored_index" is returned. */ static inline int xfrm_policy_ok(const struct xfrm_tmpl *tmpl, const struct sec_path *sp, int start, unsigned short family) { int idx = start; if (tmpl->optional) { if (tmpl->mode == XFRM_MODE_TRANSPORT) return start; } else start = -1; for (; idx < sp->len; idx++) { if (xfrm_state_ok(tmpl, sp->xvec[idx], family)) return ++idx; if (sp->xvec[idx]->props.mode != XFRM_MODE_TRANSPORT) { if (start == -1) start = -2-idx; break; } } return start; } static void decode_session4(struct sk_buff *skb, struct flowi *fl, bool reverse) { const struct iphdr *iph = ip_hdr(skb); int ihl = iph->ihl; u8 *xprth = skb_network_header(skb) + ihl * 4; struct flowi4 *fl4 = &fl->u.ip4; int oif = 0; if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl4, 0, sizeof(struct flowi4)); fl4->flowi4_mark = skb->mark; fl4->flowi4_oif = reverse ? skb->skb_iif : oif; fl4->flowi4_proto = iph->protocol; fl4->daddr = reverse ? iph->saddr : iph->daddr; fl4->saddr = reverse ? iph->daddr : iph->saddr; fl4->flowi4_tos = iph->tos; if (!ip_is_fragment(iph)) { switch (iph->protocol) { case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (xprth + 4 < skb->data || pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ports; xprth = skb_network_header(skb) + ihl * 4; ports = (__be16 *)xprth; fl4->fl4_sport = ports[!!reverse]; fl4->fl4_dport = ports[!reverse]; } break; case IPPROTO_ICMP: if (xprth + 2 < skb->data || pskb_may_pull(skb, xprth + 2 - skb->data)) { u8 *icmp; xprth = skb_network_header(skb) + ihl * 4; icmp = xprth; fl4->fl4_icmp_type = icmp[0]; fl4->fl4_icmp_code = icmp[1]; } break; case IPPROTO_ESP: if (xprth + 4 < skb->data || pskb_may_pull(skb, xprth + 4 - skb->data)) { __be32 *ehdr; xprth = skb_network_header(skb) + ihl * 4; ehdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ehdr[0]; } break; case IPPROTO_AH: if (xprth + 8 < skb->data || pskb_may_pull(skb, xprth + 8 - skb->data)) { __be32 *ah_hdr; xprth = skb_network_header(skb) + ihl * 4; ah_hdr = (__be32 *)xprth; fl4->fl4_ipsec_spi = ah_hdr[1]; } break; case IPPROTO_COMP: if (xprth + 4 < skb->data || pskb_may_pull(skb, xprth + 4 - skb->data)) { __be16 *ipcomp_hdr; xprth = skb_network_header(skb) + ihl * 4; ipcomp_hdr = (__be16 *)xprth; fl4->fl4_ipsec_spi = htonl(ntohs(ipcomp_hdr[1])); } break; case IPPROTO_GRE: if (xprth + 12 < skb->data || pskb_may_pull(skb, xprth + 12 - skb->data)) { __be16 *greflags; __be32 *gre_hdr; xprth = skb_network_header(skb) + ihl * 4; greflags = (__be16 *)xprth; gre_hdr = (__be32 *)xprth; if (greflags[0] & GRE_KEY) { if (greflags[0] & GRE_CSUM) gre_hdr++; fl4->fl4_gre_key = gre_hdr[1]; } } break; default: fl4->fl4_ipsec_spi = 0; break; } } } #if IS_ENABLED(CONFIG_IPV6) static void decode_session6(struct sk_buff *skb, struct flowi *fl, bool reverse) { struct flowi6 *fl6 = &fl->u.ip6; int onlyproto = 0; const struct ipv6hdr *hdr = ipv6_hdr(skb); u32 offset = sizeof(*hdr); struct ipv6_opt_hdr *exthdr; const unsigned char *nh = skb_network_header(skb); u16 nhoff = IP6CB(skb)->nhoff; int oif = 0; u8 nexthdr; if (!nhoff) nhoff = offsetof(struct ipv6hdr, nexthdr); nexthdr = nh[nhoff]; if (skb_dst(skb) && skb_dst(skb)->dev) oif = skb_dst(skb)->dev->ifindex; memset(fl6, 0, sizeof(struct flowi6)); fl6->flowi6_mark = skb->mark; fl6->flowi6_oif = reverse ? skb->skb_iif : oif; fl6->daddr = reverse ? hdr->saddr : hdr->daddr; fl6->saddr = reverse ? hdr->daddr : hdr->saddr; while (nh + offset + sizeof(*exthdr) < skb->data || pskb_may_pull(skb, nh + offset + sizeof(*exthdr) - skb->data)) { nh = skb_network_header(skb); exthdr = (struct ipv6_opt_hdr *)(nh + offset); switch (nexthdr) { case NEXTHDR_FRAGMENT: onlyproto = 1; fallthrough; case NEXTHDR_ROUTING: case NEXTHDR_HOP: case NEXTHDR_DEST: offset += ipv6_optlen(exthdr); nexthdr = exthdr->nexthdr; exthdr = (struct ipv6_opt_hdr *)(nh + offset); break; case IPPROTO_UDP: case IPPROTO_UDPLITE: case IPPROTO_TCP: case IPPROTO_SCTP: case IPPROTO_DCCP: if (!onlyproto && (nh + offset + 4 < skb->data || pskb_may_pull(skb, nh + offset + 4 - skb->data))) { __be16 *ports; nh = skb_network_header(skb); ports = (__be16 *)(nh + offset); fl6->fl6_sport = ports[!!reverse]; fl6->fl6_dport = ports[!reverse]; } fl6->flowi6_proto = nexthdr; return; case IPPROTO_ICMPV6: if (!onlyproto && (nh + offset + 2 < skb->data || pskb_may_pull(skb, nh + offset + 2 - skb->data))) { u8 *icmp; nh = skb_network_header(skb); icmp = (u8 *)(nh + offset); fl6->fl6_icmp_type = icmp[0]; fl6->fl6_icmp_code = icmp[1]; } fl6->flowi6_proto = nexthdr; return; #if IS_ENABLED(CONFIG_IPV6_MIP6) case IPPROTO_MH: offset += ipv6_optlen(exthdr); if (!onlyproto && (nh + offset + 3 < skb->data || pskb_may_pull(skb, nh + offset + 3 - skb->data))) { struct ip6_mh *mh; nh = skb_network_header(skb); mh = (struct ip6_mh *)(nh + offset); fl6->fl6_mh_type = mh->ip6mh_type; } fl6->flowi6_proto = nexthdr; return; #endif /* XXX Why are there these headers? */ case IPPROTO_AH: case IPPROTO_ESP: case IPPROTO_COMP: default: fl6->fl6_ipsec_spi = 0; fl6->flowi6_proto = nexthdr; return; } } } #endif int __xfrm_decode_session(struct sk_buff *skb, struct flowi *fl, unsigned int family, int reverse) { switch (family) { case AF_INET: decode_session4(skb, fl, reverse); break; #if IS_ENABLED(CONFIG_IPV6) case AF_INET6: decode_session6(skb, fl, reverse); break; #endif default: return -EAFNOSUPPORT; } return security_xfrm_decode_session(skb, &fl->flowi_secid); } EXPORT_SYMBOL(__xfrm_decode_session); static inline int secpath_has_nontransport(const struct sec_path *sp, int k, int *idxp) { for (; k < sp->len; k++) { if (sp->xvec[k]->props.mode != XFRM_MODE_TRANSPORT) { *idxp = k; return 1; } } return 0; } int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct xfrm_policy *pol; struct xfrm_policy *pols[XFRM_POLICY_TYPE_MAX]; int npols = 0; int xfrm_nr; int pi; int reverse; struct flowi fl; int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; struct xfrm_if *xi; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { xi = ifcb->decode_session(skb, family); if (xi) { if_id = xi->p.if_id; net = xi->net; } } rcu_read_unlock(); reverse = dir & ~XFRM_POLICY_MASK; dir &= XFRM_POLICY_MASK; if (__xfrm_decode_session(skb, &fl, family, reverse) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINHDRERROR); return 0; } nf_nat_decode_session(skb, &fl, family); /* First, check used SA against their selectors. */ sp = skb_sec_path(skb); if (sp) { int i; for (i = sp->len - 1; i >= 0; i--) { struct xfrm_state *x = sp->xvec[i]; if (!xfrm_selector_match(&x->sel, &fl, family)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINSTATEMISMATCH); return 0; } } } pol = NULL; sk = sk_to_full_sk(sk); if (sk && sk->sk_policy[dir]) { pol = xfrm_sk_policy_lookup(sk, dir, &fl, family, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } } if (!pol) pol = xfrm_policy_lookup(net, &fl, family, dir, if_id); if (IS_ERR(pol)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } if (!pol) { if (sp && secpath_has_nontransport(sp, 0, &xerr_idx)) { xfrm_secpath_reject(xerr_idx, skb, &fl); XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOPOLS); return 0; } return 1; } pol->curlft.use_time = ktime_get_real_seconds(); pols[0] = pol; npols++; #ifdef CONFIG_XFRM_SUB_POLICY if (pols[0]->type != XFRM_POLICY_TYPE_MAIN) { pols[1] = xfrm_policy_lookup_bytype(net, XFRM_POLICY_TYPE_MAIN, &fl, family, XFRM_POLICY_IN, if_id); if (pols[1]) { if (IS_ERR(pols[1])) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLERROR); return 0; } pols[1]->curlft.use_time = ktime_get_real_seconds(); npols++; } } #endif if (pol->action == XFRM_POLICY_ALLOW) { static struct sec_path dummy; struct xfrm_tmpl *tp[XFRM_MAX_DEPTH]; struct xfrm_tmpl *stp[XFRM_MAX_DEPTH]; struct xfrm_tmpl **tpp = tp; int ti = 0; int i, k; sp = skb_sec_path(skb); if (!sp) sp = &dummy; for (pi = 0; pi < npols; pi++) { if (pols[pi] != pol && pols[pi]->action != XFRM_POLICY_ALLOW) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); goto reject; } if (ti + pols[pi]->xfrm_nr >= XFRM_MAX_DEPTH) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINBUFFERERROR); goto reject_error; } for (i = 0; i < pols[pi]->xfrm_nr; i++) tpp[ti++] = &pols[pi]->xfrm_vec[i]; } xfrm_nr = ti; if (npols > 1) { xfrm_tmpl_sort(stp, tpp, xfrm_nr, family); tpp = stp; } /* For each tunnel xfrm, find the first matching tmpl. * For each tmpl before that, find corresponding xfrm. * Order is _important_. Later we will implement * some barriers, but at the moment barriers * are implied between each two transformations. */ for (i = xfrm_nr-1, k = 0; i >= 0; i--) { k = xfrm_policy_ok(tpp[i], sp, k, family); if (k < 0) { if (k < -1) /* "-2 - errored_index" returned */ xerr_idx = -(2+k); XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } } if (secpath_has_nontransport(sp, k, &xerr_idx)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMINTMPLMISMATCH); goto reject; } xfrm_pols_put(pols, npols); return 1; } XFRM_INC_STATS(net, LINUX_MIB_XFRMINPOLBLOCK); reject: xfrm_secpath_reject(xerr_idx, skb, &fl); reject_error: xfrm_pols_put(pols, npols); return 0; } EXPORT_SYMBOL(__xfrm_policy_check); int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; struct dst_entry *dst; int res = 1; if (xfrm_decode_session(skb, &fl, family) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } skb_dst_force(skb); if (!skb_dst(skb)) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; } skb_dst_set(skb, dst); return res; } EXPORT_SYMBOL(__xfrm_route_forward); /* Optimize later using cookies and generation ids. */ static struct dst_entry *xfrm_dst_check(struct dst_entry *dst, u32 cookie) { /* Code (such as __xfrm4_bundle_create()) sets dst->obsolete * to DST_OBSOLETE_FORCE_CHK to force all XFRM destinations to * get validated by dst_ops->check on every use. We do this * because when a normal route referenced by an XFRM dst is * obsoleted we do not go looking around for all parent * referencing XFRM dsts so that we can invalidate them. It * is just too much work. Instead we make the checks here on * every use. For example: * * XFRM dst A --> IPv4 dst X * * X is the "xdst->route" of A (X is also the "dst->path" of A * in this example). If X is marked obsolete, "A" will not * notice. That's what we are validating here via the * stale_bundle() check. * * When a dst is removed from the fib tree, DST_OBSOLETE_DEAD will * be marked on it. * This will force stale_bundle() to fail on any xdst bundle with * this dst linked in it. */ if (dst->obsolete < 0 && !stale_bundle(dst)) return dst; return NULL; } static int stale_bundle(struct dst_entry *dst) { return !xfrm_bundle_ok((struct xfrm_dst *)dst); } void xfrm_dst_ifdown(struct dst_entry *dst, struct net_device *dev) { while ((dst = xfrm_dst_child(dst)) && dst->xfrm && dst->dev == dev) { dst->dev = dev_net(dev)->loopback_dev; dev_hold(dst->dev); dev_put(dev); } } EXPORT_SYMBOL(xfrm_dst_ifdown); static void xfrm_link_failure(struct sk_buff *skb) { /* Impossible. Such dst must be popped before reaches point of failure. */ } static struct dst_entry *xfrm_negative_advice(struct dst_entry *dst) { if (dst) { if (dst->obsolete) { dst_release(dst); dst = NULL; } } return dst; } static void xfrm_init_pmtu(struct xfrm_dst **bundle, int nr) { while (nr--) { struct xfrm_dst *xdst = bundle[nr]; u32 pmtu, route_mtu_cached; struct dst_entry *dst; dst = &xdst->u.dst; pmtu = dst_mtu(xfrm_dst_child(dst)); xdst->child_mtu_cached = pmtu; pmtu = xfrm_state_mtu(dst->xfrm, pmtu); route_mtu_cached = dst_mtu(xdst->route); xdst->route_mtu_cached = route_mtu_cached; if (pmtu > route_mtu_cached) pmtu = route_mtu_cached; dst_metric_set(dst, RTAX_MTU, pmtu); } } /* Check that the bundle accepts the flow and its components are * still valid. */ static int xfrm_bundle_ok(struct xfrm_dst *first) { struct xfrm_dst *bundle[XFRM_MAX_DEPTH]; struct dst_entry *dst = &first->u.dst; struct xfrm_dst *xdst; int start_from, nr; u32 mtu; if (!dst_check(xfrm_dst_path(dst), ((struct xfrm_dst *)dst)->path_cookie) || (dst->dev && !netif_running(dst->dev))) return 0; if (dst->flags & DST_XFRM_QUEUE) return 1; start_from = nr = 0; do { struct xfrm_dst *xdst = (struct xfrm_dst *)dst; if (dst->xfrm->km.state != XFRM_STATE_VALID) return 0; if (xdst->xfrm_genid != dst->xfrm->genid) return 0; if (xdst->num_pols > 0 && xdst->policy_genid != atomic_read(&xdst->pols[0]->genid)) return 0; bundle[nr++] = xdst; mtu = dst_mtu(xfrm_dst_child(dst)); if (xdst->child_mtu_cached != mtu) { start_from = nr; xdst->child_mtu_cached = mtu; } if (!dst_check(xdst->route, xdst->route_cookie)) return 0; mtu = dst_mtu(xdst->route); if (xdst->route_mtu_cached != mtu) { start_from = nr; xdst->route_mtu_cached = mtu; } dst = xfrm_dst_child(dst); } while (dst->xfrm); if (likely(!start_from)) return 1; xdst = bundle[start_from - 1]; mtu = xdst->child_mtu_cached; while (start_from--) { dst = &xdst->u.dst; mtu = xfrm_state_mtu(dst->xfrm, mtu); if (mtu > xdst->route_mtu_cached) mtu = xdst->route_mtu_cached; dst_metric_set(dst, RTAX_MTU, mtu); if (!start_from) break; xdst = bundle[start_from - 1]; xdst->child_mtu_cached = mtu; } return 1; } static unsigned int xfrm_default_advmss(const struct dst_entry *dst) { return dst_metric_advmss(xfrm_dst_path(dst)); } static unsigned int xfrm_mtu(const struct dst_entry *dst) { unsigned int mtu = dst_metric_raw(dst, RTAX_MTU); return mtu ? : dst_mtu(xfrm_dst_path(dst)); } static const void *xfrm_get_dst_nexthop(const struct dst_entry *dst, const void *daddr) { while (dst->xfrm) { const struct xfrm_state *xfrm = dst->xfrm; dst = xfrm_dst_child(dst); if (xfrm->props.mode == XFRM_MODE_TRANSPORT) continue; if (xfrm->type->flags & XFRM_TYPE_REMOTE_COADDR) daddr = xfrm->coaddr; else if (!(xfrm->type->flags & XFRM_TYPE_LOCAL_COADDR)) daddr = &xfrm->id.daddr; } return daddr; } static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst, struct sk_buff *skb, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); if (!skb) daddr = xfrm_get_dst_nexthop(dst, daddr); return path->ops->neigh_lookup(path, skb, daddr); } static void xfrm_confirm_neigh(const struct dst_entry *dst, const void *daddr) { const struct dst_entry *path = xfrm_dst_path(dst); daddr = xfrm_get_dst_nexthop(dst, daddr); path->ops->confirm_neigh(path, daddr); } int xfrm_policy_register_afinfo(const struct xfrm_policy_afinfo *afinfo, int family) { int err = 0; if (WARN_ON(family >= ARRAY_SIZE(xfrm_policy_afinfo))) return -EAFNOSUPPORT; spin_lock(&xfrm_policy_afinfo_lock); if (unlikely(xfrm_policy_afinfo[family] != NULL)) err = -EEXIST; else { struct dst_ops *dst_ops = afinfo->dst_ops; if (likely(dst_ops->kmem_cachep == NULL)) dst_ops->kmem_cachep = xfrm_dst_cache; if (likely(dst_ops->check == NULL)) dst_ops->check = xfrm_dst_check; if (likely(dst_ops->default_advmss == NULL)) dst_ops->default_advmss = xfrm_default_advmss; if (likely(dst_ops->mtu == NULL)) dst_ops->mtu = xfrm_mtu; if (likely(dst_ops->negative_advice == NULL)) dst_ops->negative_advice = xfrm_negative_advice; if (likely(dst_ops->link_failure == NULL)) dst_ops->link_failure = xfrm_link_failure; if (likely(dst_ops->neigh_lookup == NULL)) dst_ops->neigh_lookup = xfrm_neigh_lookup; if (likely(!dst_ops->confirm_neigh)) dst_ops->confirm_neigh = xfrm_confirm_neigh; rcu_assign_pointer(xfrm_policy_afinfo[family], afinfo); } spin_unlock(&xfrm_policy_afinfo_lock); return err; } EXPORT_SYMBOL(xfrm_policy_register_afinfo); void xfrm_policy_unregister_afinfo(const struct xfrm_policy_afinfo *afinfo) { struct dst_ops *dst_ops = afinfo->dst_ops; int i; for (i = 0; i < ARRAY_SIZE(xfrm_policy_afinfo); i++) { if (xfrm_policy_afinfo[i] != afinfo) continue; RCU_INIT_POINTER(xfrm_policy_afinfo[i], NULL); break; } synchronize_rcu(); dst_ops->kmem_cachep = NULL; dst_ops->check = NULL; dst_ops->negative_advice = NULL; dst_ops->link_failure = NULL; } EXPORT_SYMBOL(xfrm_policy_unregister_afinfo); void xfrm_if_register_cb(const struct xfrm_if_cb *ifcb) { spin_lock(&xfrm_if_cb_lock); rcu_assign_pointer(xfrm_if_cb, ifcb); spin_unlock(&xfrm_if_cb_lock); } EXPORT_SYMBOL(xfrm_if_register_cb); void xfrm_if_unregister_cb(void) { RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } EXPORT_SYMBOL(xfrm_if_unregister_cb); #ifdef CONFIG_XFRM_STATISTICS static int __net_init xfrm_statistics_init(struct net *net) { int rv; net->mib.xfrm_statistics = alloc_percpu(struct linux_xfrm_mib); if (!net->mib.xfrm_statistics) return -ENOMEM; rv = xfrm_proc_init(net); if (rv < 0) free_percpu(net->mib.xfrm_statistics); return rv; } static void xfrm_statistics_fini(struct net *net) { xfrm_proc_fini(net); free_percpu(net->mib.xfrm_statistics); } #else static int __net_init xfrm_statistics_init(struct net *net) { return 0; } static void xfrm_statistics_fini(struct net *net) { } #endif static int __net_init xfrm_policy_init(struct net *net) { unsigned int hmask, sz; int dir, err; if (net_eq(net, &init_net)) { xfrm_dst_cache = kmem_cache_create("xfrm_dst_cache", sizeof(struct xfrm_dst), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); err = rhashtable_init(&xfrm_policy_inexact_table, &xfrm_pol_inexact_params); BUG_ON(err); } hmask = 8 - 1; sz = (hmask+1) * sizeof(struct hlist_head); net->xfrm.policy_byidx = xfrm_hash_alloc(sz); if (!net->xfrm.policy_byidx) goto out_byidx; net->xfrm.policy_idx_hmask = hmask; for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; net->xfrm.policy_count[dir] = 0; net->xfrm.policy_count[XFRM_POLICY_MAX + dir] = 0; INIT_HLIST_HEAD(&net->xfrm.policy_inexact[dir]); htab = &net->xfrm.policy_bydst[dir]; htab->table = xfrm_hash_alloc(sz); if (!htab->table) goto out_bydst; htab->hmask = hmask; htab->dbits4 = 32; htab->sbits4 = 32; htab->dbits6 = 128; htab->sbits6 = 128; } net->xfrm.policy_hthresh.lbits4 = 32; net->xfrm.policy_hthresh.rbits4 = 32; net->xfrm.policy_hthresh.lbits6 = 128; net->xfrm.policy_hthresh.rbits6 = 128; seqlock_init(&net->xfrm.policy_hthresh.lock); INIT_LIST_HEAD(&net->xfrm.policy_all); INIT_LIST_HEAD(&net->xfrm.inexact_bins); INIT_WORK(&net->xfrm.policy_hash_work, xfrm_hash_resize); INIT_WORK(&net->xfrm.policy_hthresh.work, xfrm_hash_rebuild); return 0; out_bydst: for (dir--; dir >= 0; dir--) { struct xfrm_policy_hash *htab; htab = &net->xfrm.policy_bydst[dir]; xfrm_hash_free(htab->table, sz); } xfrm_hash_free(net->xfrm.policy_byidx, sz); out_byidx: return -ENOMEM; } static void xfrm_policy_fini(struct net *net) { struct xfrm_pol_inexact_bin *b, *t; unsigned int sz; int dir; flush_work(&net->xfrm.policy_hash_work); #ifdef CONFIG_XFRM_SUB_POLICY xfrm_policy_flush(net, XFRM_POLICY_TYPE_SUB, false); #endif xfrm_policy_flush(net, XFRM_POLICY_TYPE_MAIN, false); WARN_ON(!list_empty(&net->xfrm.policy_all)); for (dir = 0; dir < XFRM_POLICY_MAX; dir++) { struct xfrm_policy_hash *htab; WARN_ON(!hlist_empty(&net->xfrm.policy_inexact[dir])); htab = &net->xfrm.policy_bydst[dir]; sz = (htab->hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(htab->table)); xfrm_hash_free(htab->table, sz); } sz = (net->xfrm.policy_idx_hmask + 1) * sizeof(struct hlist_head); WARN_ON(!hlist_empty(net->xfrm.policy_byidx)); xfrm_hash_free(net->xfrm.policy_byidx, sz); spin_lock_bh(&net->xfrm.xfrm_policy_lock); list_for_each_entry_safe(b, t, &net->xfrm.inexact_bins, inexact_bins) __xfrm_policy_inexact_prune_bin(b, true); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); } static int __net_init xfrm_net_init(struct net *net) { int rv; /* Initialize the per-net locks here */ spin_lock_init(&net->xfrm.xfrm_state_lock); spin_lock_init(&net->xfrm.xfrm_policy_lock); seqcount_spinlock_init(&net->xfrm.xfrm_policy_hash_generation, &net->xfrm.xfrm_policy_lock); mutex_init(&net->xfrm.xfrm_cfg_mutex); rv = xfrm_statistics_init(net); if (rv < 0) goto out_statistics; rv = xfrm_state_init(net); if (rv < 0) goto out_state; rv = xfrm_policy_init(net); if (rv < 0) goto out_policy; rv = xfrm_sysctl_init(net); if (rv < 0) goto out_sysctl; return 0; out_sysctl: xfrm_policy_fini(net); out_policy: xfrm_state_fini(net); out_state: xfrm_statistics_fini(net); out_statistics: return rv; } static void __net_exit xfrm_net_exit(struct net *net) { xfrm_sysctl_fini(net); xfrm_policy_fini(net); xfrm_state_fini(net); xfrm_statistics_fini(net); } static struct pernet_operations __net_initdata xfrm_net_ops = { .init = xfrm_net_init, .exit = xfrm_net_exit, }; void __init xfrm_init(void) { register_pernet_subsys(&xfrm_net_ops); xfrm_dev_init(); xfrm_input_init(); #ifdef CONFIG_XFRM_ESPINTCP espintcp_init(); #endif RCU_INIT_POINTER(xfrm_if_cb, NULL); synchronize_rcu(); } #ifdef CONFIG_AUDITSYSCALL static void xfrm_audit_common_policyinfo(struct xfrm_policy *xp, struct audit_buffer *audit_buf) { struct xfrm_sec_ctx *ctx = xp->security; struct xfrm_selector *sel = &xp->selector; if (ctx) audit_log_format(audit_buf, " sec_alg=%u sec_doi=%u sec_obj=%s", ctx->ctx_alg, ctx->ctx_doi, ctx->ctx_str); switch (sel->family) { case AF_INET: audit_log_format(audit_buf, " src=%pI4", &sel->saddr.a4); if (sel->prefixlen_s != 32) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI4", &sel->daddr.a4); if (sel->prefixlen_d != 32) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; case AF_INET6: audit_log_format(audit_buf, " src=%pI6", sel->saddr.a6); if (sel->prefixlen_s != 128) audit_log_format(audit_buf, " src_prefixlen=%d", sel->prefixlen_s); audit_log_format(audit_buf, " dst=%pI6", sel->daddr.a6); if (sel->prefixlen_d != 128) audit_log_format(audit_buf, " dst_prefixlen=%d", sel->prefixlen_d); break; } } void xfrm_audit_policy_add(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-add"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_add); void xfrm_audit_policy_delete(struct xfrm_policy *xp, int result, bool task_valid) { struct audit_buffer *audit_buf; audit_buf = xfrm_audit_start("SPD-delete"); if (audit_buf == NULL) return; xfrm_audit_helper_usrinfo(task_valid, audit_buf); audit_log_format(audit_buf, " res=%u", result); xfrm_audit_common_policyinfo(xp, audit_buf); audit_log_end(audit_buf); } EXPORT_SYMBOL_GPL(xfrm_audit_policy_delete); #endif #ifdef CONFIG_XFRM_MIGRATE static bool xfrm_migrate_selector_match(const struct xfrm_selector *sel_cmp, const struct xfrm_selector *sel_tgt) { if (sel_cmp->proto == IPSEC_ULPROTO_ANY) { if (sel_tgt->family == sel_cmp->family && xfrm_addr_equal(&sel_tgt->daddr, &sel_cmp->daddr, sel_cmp->family) && xfrm_addr_equal(&sel_tgt->saddr, &sel_cmp->saddr, sel_cmp->family) && sel_tgt->prefixlen_d == sel_cmp->prefixlen_d && sel_tgt->prefixlen_s == sel_cmp->prefixlen_s) { return true; } } else { if (memcmp(sel_tgt, sel_cmp, sizeof(*sel_tgt)) == 0) { return true; } } return false; } static struct xfrm_policy *xfrm_migrate_policy_find(const struct xfrm_selector *sel, u8 dir, u8 type, struct net *net) { struct xfrm_policy *pol, *ret = NULL; struct hlist_head *chain; u32 priority = ~0U; spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_direct(net, &sel->daddr, &sel->saddr, sel->family, dir); hlist_for_each_entry(pol, chain, bydst) { if (xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; priority = ret->priority; break; } } chain = &net->xfrm.policy_inexact[dir]; hlist_for_each_entry(pol, chain, bydst_inexact_list) { if ((pol->priority >= priority) && ret) break; if (xfrm_migrate_selector_match(sel, &pol->selector) && pol->type == type) { ret = pol; break; } } xfrm_pol_hold(ret); spin_unlock_bh(&net->xfrm.xfrm_policy_lock); return ret; } static int migrate_tmpl_match(const struct xfrm_migrate *m, const struct xfrm_tmpl *t) { int match = 0; if (t->mode == m->mode && t->id.proto == m->proto && (m->reqid == 0 || t->reqid == m->reqid)) { switch (t->mode) { case XFRM_MODE_TUNNEL: case XFRM_MODE_BEET: if (xfrm_addr_equal(&t->id.daddr, &m->old_daddr, m->old_family) && xfrm_addr_equal(&t->saddr, &m->old_saddr, m->old_family)) { match = 1; } break; case XFRM_MODE_TRANSPORT: /* in case of transport mode, template does not store any IP addresses, hence we just compare mode and protocol */ match = 1; break; default: break; } } return match; } /* update endpoint address(es) of template(s) */ static int xfrm_policy_migrate(struct xfrm_policy *pol, struct xfrm_migrate *m, int num_migrate) { struct xfrm_migrate *mp; int i, j, n = 0; write_lock_bh(&pol->lock); if (unlikely(pol->walk.dead)) { /* target policy has been deleted */ write_unlock_bh(&pol->lock); return -ENOENT; } for (i = 0; i < pol->xfrm_nr; i++) { for (j = 0, mp = m; j < num_migrate; j++, mp++) { if (!migrate_tmpl_match(mp, &pol->xfrm_vec[i])) continue; n++; if (pol->xfrm_vec[i].mode != XFRM_MODE_TUNNEL && pol->xfrm_vec[i].mode != XFRM_MODE_BEET) continue; /* update endpoints */ memcpy(&pol->xfrm_vec[i].id.daddr, &mp->new_daddr, sizeof(pol->xfrm_vec[i].id.daddr)); memcpy(&pol->xfrm_vec[i].saddr, &mp->new_saddr, sizeof(pol->xfrm_vec[i].saddr)); pol->xfrm_vec[i].encap_family = mp->new_family; /* flush bundles */ atomic_inc(&pol->genid); } } write_unlock_bh(&pol->lock); if (!n) return -ENODATA; return 0; } static int xfrm_migrate_check(const struct xfrm_migrate *m, int num_migrate) { int i, j; if (num_migrate < 1 || num_migrate > XFRM_MAX_DEPTH) return -EINVAL; for (i = 0; i < num_migrate; i++) { if (xfrm_addr_any(&m[i].new_daddr, m[i].new_family) || xfrm_addr_any(&m[i].new_saddr, m[i].new_family)) return -EINVAL; /* check if there is any duplicated entry */ for (j = i + 1; j < num_migrate; j++) { if (!memcmp(&m[i].old_daddr, &m[j].old_daddr, sizeof(m[i].old_daddr)) && !memcmp(&m[i].old_saddr, &m[j].old_saddr, sizeof(m[i].old_saddr)) && m[i].proto == m[j].proto && m[i].mode == m[j].mode && m[i].reqid == m[j].reqid && m[i].old_family == m[j].old_family) return -EINVAL; } } return 0; } int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_migrate *m, int num_migrate, struct xfrm_kmaddress *k, struct net *net, struct xfrm_encap_tmpl *encap) { int i, err, nx_cur = 0, nx_new = 0; struct xfrm_policy *pol = NULL; struct xfrm_state *x, *xc; struct xfrm_state *x_cur[XFRM_MAX_DEPTH]; struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; /* Stage 0 - sanity checks */ if ((err = xfrm_migrate_check(m, num_migrate)) < 0) goto out; if (dir >= XFRM_POLICY_MAX) { err = -EINVAL; goto out; } /* Stage 1 - find policy */ if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { err = -ENOENT; goto out; } /* Stage 2 - find and update state(s) */ for (i = 0, mp = m; i < num_migrate; i++, mp++) { if ((x = xfrm_migrate_state_find(mp, net))) { x_cur[nx_cur] = x; nx_cur++; xc = xfrm_state_migrate(x, mp, encap); if (xc) { x_new[nx_new] = xc; nx_new++; } else { err = -ENODATA; goto restore_state; } } } /* Stage 3 - update policy */ if ((err = xfrm_policy_migrate(pol, m, num_migrate)) < 0) goto restore_state; /* Stage 4 - delete old state(s) */ if (nx_cur) { xfrm_states_put(x_cur, nx_cur); xfrm_states_delete(x_cur, nx_cur); } /* Stage 5 - announce */ km_migrate(sel, dir, type, m, num_migrate, k, encap); xfrm_pol_put(pol); return 0; out: return err; restore_state: if (pol) xfrm_pol_put(pol); if (nx_cur) xfrm_states_put(x_cur, nx_cur); if (nx_new) xfrm_states_delete(x_new, nx_new); return err; } EXPORT_SYMBOL(xfrm_migrate); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 /* SPDX-License-Identifier: GPL-2.0 */ /* * NUMA memory policies for Linux. * Copyright 2003,2004 Andi Kleen SuSE Labs */ #ifndef _LINUX_MEMPOLICY_H #define _LINUX_MEMPOLICY_H 1 #include <linux/sched.h> #include <linux/mmzone.h> #include <linux/dax.h> #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/spinlock.h> #include <linux/nodemask.h> #include <linux/pagemap.h> #include <uapi/linux/mempolicy.h> struct mm_struct; #ifdef CONFIG_NUMA /* * Describe a memory policy. * * A mempolicy can be either associated with a process or with a VMA. * For VMA related allocations the VMA policy is preferred, otherwise * the process policy is used. Interrupts ignore the memory policy * of the current process. * * Locking policy for interleave: * In process context there is no locking because only the process accesses * its own state. All vma manipulation is somewhat protected by a down_read on * mmap_lock. * * Freeing policy: * Mempolicy objects are reference counted. A mempolicy will be freed when * mpol_put() decrements the reference count to zero. * * Duplicating policy objects: * mpol_dup() allocates a new mempolicy and copies the specified mempolicy * to the new storage. The reference count of the new object is initialized * to 1, representing the caller of mpol_dup(). */ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ union { short preferred_node; /* preferred */ nodemask_t nodes; /* interleave/bind */ /* undefined for default */ } v; union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ nodemask_t user_nodemask; /* nodemask passed by user */ } w; }; /* * Support for managing mempolicy data objects (clone, copy, destroy) * The default fast path of a NULL MPOL_DEFAULT policy is always inlined. */ extern void __mpol_put(struct mempolicy *pol); static inline void mpol_put(struct mempolicy *pol) { if (pol) __mpol_put(pol); } /* * Does mempolicy pol need explicit unref after use? * Currently only needed for shared policies. */ static inline int mpol_needs_cond_ref(struct mempolicy *pol) { return (pol && (pol->flags & MPOL_F_SHARED)); } static inline void mpol_cond_put(struct mempolicy *pol) { if (mpol_needs_cond_ref(pol)) __mpol_put(pol); } extern struct mempolicy *__mpol_dup(struct mempolicy *pol); static inline struct mempolicy *mpol_dup(struct mempolicy *pol) { if (pol) pol = __mpol_dup(pol); return pol; } #define vma_policy(vma) ((vma)->vm_policy) static inline void mpol_get(struct mempolicy *pol) { if (pol) atomic_inc(&pol->refcnt); } extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) return true; return __mpol_equal(a, b); } /* * Tree of shared policies for a shared memory region. * Maintain the policies in a pseudo mm that contains vmas. The vmas * carry the policy. As a special twist the pseudo mm is indexed in pages, not * bytes, so that we can work with shared memory segments bigger than * unsigned long. */ struct sp_node { struct rb_node nd; unsigned long start, end; struct mempolicy *policy; }; struct shared_policy { struct rb_root root; rwlock_t lock; }; int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst); void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol); int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, struct mempolicy *new); void mpol_free_shared_policy(struct shared_policy *p); struct mempolicy *mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx); struct mempolicy *get_task_policy(struct task_struct *p); struct mempolicy *__get_vma_policy(struct vm_area_struct *vma, unsigned long addr); bool vma_policy_mof(struct vm_area_struct *vma); extern void numa_default_policy(void); extern void numa_policy_init(void); extern void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new); extern void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new); extern int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask); extern bool init_nodemask_of_mempolicy(nodemask_t *mask); extern bool mempolicy_nodemask_intersects(struct task_struct *tsk, const nodemask_t *mask); extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { struct mempolicy *mpol = get_task_policy(current); return policy_nodemask(gfp, mpol); } extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; static inline void check_highest_zone(enum zone_type k) { if (k > policy_zone && k != ZONE_MOVABLE) policy_zone = k; } int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags); #ifdef CONFIG_TMPFS extern int mpol_parse_str(char *str, struct mempolicy **mpol); #endif extern void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol); /* Check if a vma is migratable */ extern bool vma_migratable(struct vm_area_struct *vma); extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); extern void mpol_put_task_policy(struct task_struct *); #else struct mempolicy {}; static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { return true; } static inline void mpol_put(struct mempolicy *p) { } static inline void mpol_cond_put(struct mempolicy *pol) { } static inline void mpol_get(struct mempolicy *pol) { } struct shared_policy {}; static inline void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { } static inline void mpol_free_shared_policy(struct shared_policy *p) { } static inline struct mempolicy * mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) { return NULL; } #define vma_policy(vma) NULL static inline int vma_dup_policy(struct vm_area_struct *src, struct vm_area_struct *dst) { return 0; } static inline void numa_policy_init(void) { } static inline void numa_default_policy(void) { } static inline void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) { } static inline void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { } static inline int huge_node(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, nodemask_t **nodemask) { *mpol = NULL; *nodemask = NULL; return 0; } static inline bool init_nodemask_of_mempolicy(nodemask_t *m) { return false; } static inline int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, int flags) { return 0; } static inline void check_highest_zone(int k) { } #ifdef CONFIG_TMPFS static inline int mpol_parse_str(char *str, struct mempolicy **mpol) { return 1; /* error */ } #endif static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long address) { return -1; /* no node preference */ } static inline void mpol_put_task_policy(struct task_struct *task) { } static inline nodemask_t *policy_nodemask_current(gfp_t gfp) { return NULL; } #endif /* CONFIG_NUMA */ #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 /* This file is automatically generated. Do not edit. */ #ifndef _SELINUX_FLASK_H_ #define _SELINUX_FLASK_H_ #define SECCLASS_SECURITY 1 #define SECCLASS_PROCESS 2 #define SECCLASS_PROCESS2 3 #define SECCLASS_SYSTEM 4 #define SECCLASS_CAPABILITY 5 #define SECCLASS_FILESYSTEM 6 #define SECCLASS_FILE 7 #define SECCLASS_DIR 8 #define SECCLASS_FD 9 #define SECCLASS_LNK_FILE 10 #define SECCLASS_CHR_FILE 11 #define SECCLASS_BLK_FILE 12 #define SECCLASS_SOCK_FILE 13 #define SECCLASS_FIFO_FILE 14 #define SECCLASS_SOCKET 15 #define SECCLASS_TCP_SOCKET 16 #define SECCLASS_UDP_SOCKET 17 #define SECCLASS_RAWIP_SOCKET 18 #define SECCLASS_NODE 19 #define SECCLASS_NETIF 20 #define SECCLASS_NETLINK_SOCKET 21 #define SECCLASS_PACKET_SOCKET 22 #define SECCLASS_KEY_SOCKET 23 #define SECCLASS_UNIX_STREAM_SOCKET 24 #define SECCLASS_UNIX_DGRAM_SOCKET 25 #define SECCLASS_SEM 26 #define SECCLASS_MSG 27 #define SECCLASS_MSGQ 28 #define SECCLASS_SHM 29 #define SECCLASS_IPC 30 #define SECCLASS_NETLINK_ROUTE_SOCKET 31 #define SECCLASS_NETLINK_TCPDIAG_SOCKET 32 #define SECCLASS_NETLINK_NFLOG_SOCKET 33 #define SECCLASS_NETLINK_XFRM_SOCKET 34 #define SECCLASS_NETLINK_SELINUX_SOCKET 35 #define SECCLASS_NETLINK_ISCSI_SOCKET 36 #define SECCLASS_NETLINK_AUDIT_SOCKET 37 #define SECCLASS_NETLINK_FIB_LOOKUP_SOCKET 38 #define SECCLASS_NETLINK_CONNECTOR_SOCKET 39 #define SECCLASS_NETLINK_NETFILTER_SOCKET 40 #define SECCLASS_NETLINK_DNRT_SOCKET 41 #define SECCLASS_ASSOCIATION 42 #define SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET 43 #define SECCLASS_NETLINK_GENERIC_SOCKET 44 #define SECCLASS_NETLINK_SCSITRANSPORT_SOCKET 45 #define SECCLASS_NETLINK_RDMA_SOCKET 46 #define SECCLASS_NETLINK_CRYPTO_SOCKET 47 #define SECCLASS_APPLETALK_SOCKET 48 #define SECCLASS_PACKET 49 #define SECCLASS_KEY 50 #define SECCLASS_DCCP_SOCKET 51 #define SECCLASS_MEMPROTECT 52 #define SECCLASS_PEER 53 #define SECCLASS_CAPABILITY2 54 #define SECCLASS_KERNEL_SERVICE 55 #define SECCLASS_TUN_SOCKET 56 #define SECCLASS_BINDER 57 #define SECCLASS_CAP_USERNS 58 #define SECCLASS_CAP2_USERNS 59 #define SECCLASS_SCTP_SOCKET 60 #define SECCLASS_ICMP_SOCKET 61 #define SECCLASS_AX25_SOCKET 62 #define SECCLASS_IPX_SOCKET 63 #define SECCLASS_NETROM_SOCKET 64 #define SECCLASS_ATMPVC_SOCKET 65 #define SECCLASS_X25_SOCKET 66 #define SECCLASS_ROSE_SOCKET 67 #define SECCLASS_DECNET_SOCKET 68 #define SECCLASS_ATMSVC_SOCKET 69 #define SECCLASS_RDS_SOCKET 70 #define SECCLASS_IRDA_SOCKET 71 #define SECCLASS_PPPOX_SOCKET 72 #define SECCLASS_LLC_SOCKET 73 #define SECCLASS_CAN_SOCKET 74 #define SECCLASS_TIPC_SOCKET 75 #define SECCLASS_BLUETOOTH_SOCKET 76 #define SECCLASS_IUCV_SOCKET 77 #define SECCLASS_RXRPC_SOCKET 78 #define SECCLASS_ISDN_SOCKET 79 #define SECCLASS_PHONET_SOCKET 80 #define SECCLASS_IEEE802154_SOCKET 81 #define SECCLASS_CAIF_SOCKET 82 #define SECCLASS_ALG_SOCKET 83 #define SECCLASS_NFC_SOCKET 84 #define SECCLASS_VSOCK_SOCKET 85 #define SECCLASS_KCM_SOCKET 86 #define SECCLASS_QIPCRTR_SOCKET 87 #define SECCLASS_SMC_SOCKET 88 #define SECCLASS_INFINIBAND_PKEY 89 #define SECCLASS_INFINIBAND_ENDPORT 90 #define SECCLASS_BPF 91 #define SECCLASS_XDP_SOCKET 92 #define SECCLASS_PERF_EVENT 93 #define SECCLASS_LOCKDOWN 94 #define SECINITSID_KERNEL 1 #define SECINITSID_SECURITY 2 #define SECINITSID_UNLABELED 3 #define SECINITSID_FILE 5 #define SECINITSID_ANY_SOCKET 8 #define SECINITSID_PORT 9 #define SECINITSID_NETIF 10 #define SECINITSID_NETMSG 11 #define SECINITSID_NODE 12 #define SECINITSID_DEVNULL 27 #define SECINITSID_NUM 27 static inline bool security_is_socket_class(u16 kern_tclass) { bool sock = false; switch (kern_tclass) { case SECCLASS_SOCKET: case SECCLASS_TCP_SOCKET: case SECCLASS_UDP_SOCKET: case SECCLASS_RAWIP_SOCKET: case SECCLASS_NETLINK_SOCKET: case SECCLASS_PACKET_SOCKET: case SECCLASS_KEY_SOCKET: case SECCLASS_UNIX_STREAM_SOCKET: case SECCLASS_UNIX_DGRAM_SOCKET: case SECCLASS_NETLINK_ROUTE_SOCKET: case SECCLASS_NETLINK_TCPDIAG_SOCKET: case SECCLASS_NETLINK_NFLOG_SOCKET: case SECCLASS_NETLINK_XFRM_SOCKET: case SECCLASS_NETLINK_SELINUX_SOCKET: case SECCLASS_NETLINK_ISCSI_SOCKET: case SECCLASS_NETLINK_AUDIT_SOCKET: case SECCLASS_NETLINK_FIB_LOOKUP_SOCKET: case SECCLASS_NETLINK_CONNECTOR_SOCKET: case SECCLASS_NETLINK_NETFILTER_SOCKET: case SECCLASS_NETLINK_DNRT_SOCKET: case SECCLASS_NETLINK_KOBJECT_UEVENT_SOCKET: case SECCLASS_NETLINK_GENERIC_SOCKET: case SECCLASS_NETLINK_SCSITRANSPORT_SOCKET: case SECCLASS_NETLINK_RDMA_SOCKET: case SECCLASS_NETLINK_CRYPTO_SOCKET: case SECCLASS_APPLETALK_SOCKET: case SECCLASS_DCCP_SOCKET: case SECCLASS_TUN_SOCKET: case SECCLASS_SCTP_SOCKET: case SECCLASS_ICMP_SOCKET: case SECCLASS_AX25_SOCKET: case SECCLASS_IPX_SOCKET: case SECCLASS_NETROM_SOCKET: case SECCLASS_ATMPVC_SOCKET: case SECCLASS_X25_SOCKET: case SECCLASS_ROSE_SOCKET: case SECCLASS_DECNET_SOCKET: case SECCLASS_ATMSVC_SOCKET: case SECCLASS_RDS_SOCKET: case SECCLASS_IRDA_SOCKET: case SECCLASS_PPPOX_SOCKET: case SECCLASS_LLC_SOCKET: case SECCLASS_CAN_SOCKET: case SECCLASS_TIPC_SOCKET: case SECCLASS_BLUETOOTH_SOCKET: case SECCLASS_IUCV_SOCKET: case SECCLASS_RXRPC_SOCKET: case SECCLASS_ISDN_SOCKET: case SECCLASS_PHONET_SOCKET: case SECCLASS_IEEE802154_SOCKET: case SECCLASS_CAIF_SOCKET: case SECCLASS_ALG_SOCKET: case SECCLASS_NFC_SOCKET: case SECCLASS_VSOCK_SOCKET: case SECCLASS_KCM_SOCKET: case SECCLASS_QIPCRTR_SOCKET: case SECCLASS_SMC_SOCKET: case SECCLASS_XDP_SOCKET: sock = true; break; default: break; } return sock; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_JIFFIES_H #define _LINUX_JIFFIES_H #include <linux/cache.h> #include <linux/limits.h> #include <linux/math64.h> #include <linux/minmax.h> #include <linux/types.h> #include <linux/time.h> #include <linux/timex.h> #include <vdso/jiffies.h> #include <asm/param.h> /* for HZ */ #include <generated/timeconst.h> /* * The following defines establish the engineering parameters of the PLL * model. The HZ variable establishes the timer interrupt frequency, 100 Hz * for the SunOS kernel, 256 Hz for the Ultrix kernel and 1024 Hz for the * OSF/1 kernel. The SHIFT_HZ define expresses the same value as the * nearest power of two in order to avoid hardware multiply operations. */ #if HZ >= 12 && HZ < 24 # define SHIFT_HZ 4 #elif HZ >= 24 && HZ < 48 # define SHIFT_HZ 5 #elif HZ >= 48 && HZ < 96 # define SHIFT_HZ 6 #elif HZ >= 96 && HZ < 192 # define SHIFT_HZ 7 #elif HZ >= 192 && HZ < 384 # define SHIFT_HZ 8 #elif HZ >= 384 && HZ < 768 # define SHIFT_HZ 9 #elif HZ >= 768 && HZ < 1536 # define SHIFT_HZ 10 #elif HZ >= 1536 && HZ < 3072 # define SHIFT_HZ 11 #elif HZ >= 3072 && HZ < 6144 # define SHIFT_HZ 12 #elif HZ >= 6144 && HZ < 12288 # define SHIFT_HZ 13 #else # error Invalid value of HZ. #endif /* Suppose we want to divide two numbers NOM and DEN: NOM/DEN, then we can * improve accuracy by shifting LSH bits, hence calculating: * (NOM << LSH) / DEN * This however means trouble for large NOM, because (NOM << LSH) may no * longer fit in 32 bits. The following way of calculating this gives us * some slack, under the following conditions: * - (NOM / DEN) fits in (32 - LSH) bits. * - (NOM % DEN) fits in (32 - LSH) bits. */ #define SH_DIV(NOM,DEN,LSH) ( (((NOM) / (DEN)) << (LSH)) \ + ((((NOM) % (DEN)) << (LSH)) + (DEN) / 2) / (DEN)) /* LATCH is used in the interval timer and ftape setup. */ #define LATCH ((CLOCK_TICK_RATE + HZ/2) / HZ) /* For divider */ extern int register_refined_jiffies(long clock_tick_rate); /* TICK_USEC is the time between ticks in usec assuming SHIFTED_HZ */ #define TICK_USEC ((USEC_PER_SEC + HZ/2) / HZ) /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) #ifndef __jiffy_arch_data #define __jiffy_arch_data #endif /* * The 64-bit value is not atomic - you MUST NOT read it * without sampling the sequence number in jiffies_lock. * get_jiffies_64() will do this for you as appropriate. */ extern u64 __cacheline_aligned_in_smp jiffies_64; extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); #else static inline u64 get_jiffies_64(void) { return (u64)jiffies; } #endif /* * These inlines deal with timer wrapping correctly. You are * strongly encouraged to use them * 1. Because people otherwise forget * 2. Because if the timer wrap changes in future you won't have to * alter your driver code. * * time_after(a,b) returns true if the time a is after time b. * * Do this with "<0" and ">=0" to only test the sign of the result. A * good compiler would generate better code (and a really good compiler * wouldn't care). Gcc is currently neither. */ #define time_after(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((b) - (a)) < 0)) #define time_before(a,b) time_after(b,a) #define time_after_eq(a,b) \ (typecheck(unsigned long, a) && \ typecheck(unsigned long, b) && \ ((long)((a) - (b)) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) /* * Calculate whether a is in the range of [b, c]. */ #define time_in_range(a,b,c) \ (time_after_eq(a,b) && \ time_before_eq(a,c)) /* * Calculate whether a is in the range of [b, c). */ #define time_in_range_open(a,b,c) \ (time_after_eq(a,b) && \ time_before(a,c)) /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64() */ #define time_after64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((b) - (a)) < 0)) #define time_before64(a,b) time_after64(b,a) #define time_after_eq64(a,b) \ (typecheck(__u64, a) && \ typecheck(__u64, b) && \ ((__s64)((a) - (b)) >= 0)) #define time_before_eq64(a,b) time_after_eq64(b,a) #define time_in_range64(a, b, c) \ (time_after_eq64(a, b) && \ time_before_eq64(a, c)) /* * These four macros compare jiffies and 'a' for convenience. */ /* time_is_before_jiffies(a) return true if a is before jiffies */ #define time_is_before_jiffies(a) time_after(jiffies, a) #define time_is_before_jiffies64(a) time_after64(get_jiffies_64(), a) /* time_is_after_jiffies(a) return true if a is after jiffies */ #define time_is_after_jiffies(a) time_before(jiffies, a) #define time_is_after_jiffies64(a) time_before64(get_jiffies_64(), a) /* time_is_before_eq_jiffies(a) return true if a is before or equal to jiffies*/ #define time_is_before_eq_jiffies(a) time_after_eq(jiffies, a) #define time_is_before_eq_jiffies64(a) time_after_eq64(get_jiffies_64(), a) /* time_is_after_eq_jiffies(a) return true if a is after or equal to jiffies*/ #define time_is_after_eq_jiffies(a) time_before_eq(jiffies, a) #define time_is_after_eq_jiffies64(a) time_before_eq64(get_jiffies_64(), a) /* * Have the 32 bit jiffies value wrap 5 minutes after boot * so jiffies wrap bugs show up earlier. */ #define INITIAL_JIFFIES ((unsigned long)(unsigned int) (-300*HZ)) /* * Change timeval to jiffies, trying to avoid the * most obvious overflows.. * * And some not so obvious. * * Note that we don't want to return LONG_MAX, because * for various timeout reasons we often end up having * to wait "jiffies+1" in order to guarantee that we wait * at _least_ "jiffies" - so "jiffies+1" had better still * be positive. */ #define MAX_JIFFY_OFFSET ((LONG_MAX >> 1)-1) extern unsigned long preset_lpj; /* * We want to do realistic conversions of time so we need to use the same * values the update wall clock code uses as the jiffies size. This value * is: TICK_NSEC (which is defined in timex.h). This * is a constant and is in nanoseconds. We will use scaled math * with a set of scales defined here as SEC_JIFFIE_SC, USEC_JIFFIE_SC and * NSEC_JIFFIE_SC. Note that these defines contain nothing but * constants and so are computed at compile time. SHIFT_HZ (computed in * timex.h) adjusts the scaling for different HZ values. * Scaled math??? What is that? * * Scaled math is a way to do integer math on values that would, * otherwise, either overflow, underflow, or cause undesired div * instructions to appear in the execution path. In short, we "scale" * up the operands so they take more bits (more precision, less * underflow), do the desired operation and then "scale" the result back * by the same amount. If we do the scaling by shifting we avoid the * costly mpy and the dastardly div instructions. * Suppose, for example, we want to convert from seconds to jiffies * where jiffies is defined in nanoseconds as NSEC_PER_JIFFIE. The * simple math is: jiff = (sec * NSEC_PER_SEC) / NSEC_PER_JIFFIE; We * observe that (NSEC_PER_SEC / NSEC_PER_JIFFIE) is a constant which we * might calculate at compile time, however, the result will only have * about 3-4 bits of precision (less for smaller values of HZ). * * So, we scale as follows: * jiff = (sec) * (NSEC_PER_SEC / NSEC_PER_JIFFIE); * jiff = ((sec) * ((NSEC_PER_SEC * SCALE)/ NSEC_PER_JIFFIE)) / SCALE; * Then we make SCALE a power of two so: * jiff = ((sec) * ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) >> SCALE; * Now we define: * #define SEC_CONV = ((NSEC_PER_SEC << SCALE)/ NSEC_PER_JIFFIE)) * jiff = (sec * SEC_CONV) >> SCALE; * * Often the math we use will expand beyond 32-bits so we tell C how to * do this and pass the 64-bit result of the mpy through the ">> SCALE" * which should take the result back to 32-bits. We want this expansion * to capture as much precision as possible. At the same time we don't * want to overflow so we pick the SCALE to avoid this. In this file, * that means using a different scale for each range of HZ values (as * defined in timex.h). * * For those who want to know, gcc will give a 64-bit result from a "*" * operator if the result is a long long AND at least one of the * operands is cast to long long (usually just prior to the "*" so as * not to confuse it into thinking it really has a 64-bit operand, * which, buy the way, it can do, but it takes more code and at least 2 * mpys). * We also need to be aware that one second in nanoseconds is only a * couple of bits away from overflowing a 32-bit word, so we MUST use * 64-bits to get the full range time in nanoseconds. */ /* * Here are the scales we will use. One for seconds, nanoseconds and * microseconds. * * Within the limits of cpp we do a rough cut at the SEC_JIFFIE_SC and * check if the sign bit is set. If not, we bump the shift count by 1. * (Gets an extra bit of precision where we can use it.) * We know it is set for HZ = 1024 and HZ = 100 not for 1000. * Haven't tested others. * Limits of cpp (for #if expressions) only long (no long long), but * then we only need the most signicant bit. */ #define SEC_JIFFIE_SC (31 - SHIFT_HZ) #if !((((NSEC_PER_SEC << 2) / TICK_NSEC) << (SEC_JIFFIE_SC - 2)) & 0x80000000) #undef SEC_JIFFIE_SC #define SEC_JIFFIE_SC (32 - SHIFT_HZ) #endif #define NSEC_JIFFIE_SC (SEC_JIFFIE_SC + 29) #define SEC_CONVERSION ((unsigned long)((((u64)NSEC_PER_SEC << SEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) #define NSEC_CONVERSION ((unsigned long)((((u64)1 << NSEC_JIFFIE_SC) +\ TICK_NSEC -1) / (u64)TICK_NSEC)) /* * The maximum jiffie value is (MAX_INT >> 1). Here we translate that * into seconds. The 64-bit case will overflow if we are not careful, * so use the messy SH_DIV macro to do it. Still all constants. */ #if BITS_PER_LONG < 64 # define MAX_SEC_IN_JIFFIES \ (long)((u64)((u64)MAX_JIFFY_OFFSET * TICK_NSEC) / NSEC_PER_SEC) #else /* take care of overflow on 64 bits machines */ # define MAX_SEC_IN_JIFFIES \ (SH_DIV((MAX_JIFFY_OFFSET >> SEC_JIFFIE_SC) * TICK_NSEC, NSEC_PER_SEC, 1) - 1) #endif /* * Convert various time units to each other: */ extern unsigned int jiffies_to_msecs(const unsigned long j); extern unsigned int jiffies_to_usecs(const unsigned long j); static inline u64 jiffies_to_nsecs(const unsigned long j) { return (u64)jiffies_to_usecs(j) * NSEC_PER_USEC; } extern u64 jiffies64_to_nsecs(u64 j); extern u64 jiffies64_to_msecs(u64 j); extern unsigned long __msecs_to_jiffies(const unsigned int m); #if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) /* * HZ is equal to or smaller than 1000, and 1000 is a nice round * multiple of HZ, divide with the factor between them, but round * upwards: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); } #elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) /* * HZ is larger than 1000, and HZ is a nice round multiple of 1000 - * simply multiply with the factor between them. * * But first make sure the multiplication result cannot overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return m * (HZ / MSEC_PER_SEC); } #else /* * Generic case - multiply, round and divide. But first check that if * we are doing a net multiplication, that we wouldn't overflow: */ static inline unsigned long _msecs_to_jiffies(const unsigned int m) { if (HZ > MSEC_PER_SEC && m > jiffies_to_msecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return (MSEC_TO_HZ_MUL32 * m + MSEC_TO_HZ_ADJ32) >> MSEC_TO_HZ_SHR32; } #endif /** * msecs_to_jiffies: - convert milliseconds to jiffies * @m: time in milliseconds * * conversion is done as follows: * * - negative values mean 'infinite timeout' (MAX_JIFFY_OFFSET) * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows. * for the details see __msecs_to_jiffies() * * msecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __msecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * the HZ range specific helpers _msecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. */ static __always_inline unsigned long msecs_to_jiffies(const unsigned int m) { if (__builtin_constant_p(m)) { if ((int)m < 0) return MAX_JIFFY_OFFSET; return _msecs_to_jiffies(m); } else { return __msecs_to_jiffies(m); } } extern unsigned long __usecs_to_jiffies(const unsigned int u); #if !(USEC_PER_SEC % HZ) static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (u + (USEC_PER_SEC / HZ) - 1) / (USEC_PER_SEC / HZ); } #else static inline unsigned long _usecs_to_jiffies(const unsigned int u) { return (USEC_TO_HZ_MUL32 * u + USEC_TO_HZ_ADJ32) >> USEC_TO_HZ_SHR32; } #endif /** * usecs_to_jiffies: - convert microseconds to jiffies * @u: time in microseconds * * conversion is done as follows: * * - 'too large' values [that would result in larger than * MAX_JIFFY_OFFSET values] mean 'infinite timeout' too. * * - all other values are converted to jiffies by either multiplying * the input value by a factor or dividing it with a factor and * handling any 32-bit overflows as for msecs_to_jiffies. * * usecs_to_jiffies() checks for the passed in value being a constant * via __builtin_constant_p() allowing gcc to eliminate most of the * code, __usecs_to_jiffies() is called if the value passed does not * allow constant folding and the actual conversion must be done at * runtime. * the HZ range specific helpers _usecs_to_jiffies() are called both * directly here and from __msecs_to_jiffies() in the case where * constant folding is not possible. */ static __always_inline unsigned long usecs_to_jiffies(const unsigned int u) { if (__builtin_constant_p(u)) { if (u > jiffies_to_usecs(MAX_JIFFY_OFFSET)) return MAX_JIFFY_OFFSET; return _usecs_to_jiffies(u); } else { return __usecs_to_jiffies(u); } } extern unsigned long timespec64_to_jiffies(const struct timespec64 *value); extern void jiffies_to_timespec64(const unsigned long jiffies, struct timespec64 *value); extern clock_t jiffies_to_clock_t(unsigned long x); static inline clock_t jiffies_delta_to_clock_t(long delta) { return jiffies_to_clock_t(max(0L, delta)); } static inline unsigned int jiffies_delta_to_msecs(long delta) { return jiffies_to_msecs(max(0L, delta)); } extern unsigned long clock_t_to_jiffies(unsigned long x); extern u64 jiffies_64_to_clock_t(u64 x); extern u64 nsec_to_clock_t(u64 x); extern u64 nsecs_to_jiffies64(u64 n); extern unsigned long nsecs_to_jiffies(u64 n); #define TIMESTAMP_SIZE 30 #endif
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * VLAN An implementation of 802.1Q VLAN tagging. * * Authors: Ben Greear <greearb@candelatech.com> */ #ifndef _LINUX_IF_VLAN_H_ #define _LINUX_IF_VLAN_H_ #include <linux/netdevice.h> #include <linux/etherdevice.h> #include <linux/rtnetlink.h> #include <linux/bug.h> #include <uapi/linux/if_vlan.h> #define VLAN_HLEN 4 /* The additional bytes required by VLAN * (in addition to the Ethernet header) */ #define VLAN_ETH_HLEN 18 /* Total octets in header. */ #define VLAN_ETH_ZLEN 64 /* Min. octets in frame sans FCS */ /* * According to 802.3ac, the packet can be 4 bytes longer. --Klika Jan */ #define VLAN_ETH_DATA_LEN 1500 /* Max. octets in payload */ #define VLAN_ETH_FRAME_LEN 1518 /* Max. octets in frame sans FCS */ #define VLAN_MAX_DEPTH 8 /* Max. number of nested VLAN tags parsed */ /* * struct vlan_hdr - vlan header * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_hdr { __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; /** * struct vlan_ethhdr - vlan ethernet header (ethhdr + vlan_hdr) * @h_dest: destination ethernet address * @h_source: source ethernet address * @h_vlan_proto: ethernet protocol * @h_vlan_TCI: priority and VLAN ID * @h_vlan_encapsulated_proto: packet type ID or len */ struct vlan_ethhdr { unsigned char h_dest[ETH_ALEN]; unsigned char h_source[ETH_ALEN]; __be16 h_vlan_proto; __be16 h_vlan_TCI; __be16 h_vlan_encapsulated_proto; }; #include <linux/skbuff.h> static inline struct vlan_ethhdr *vlan_eth_hdr(const struct sk_buff *skb) { return (struct vlan_ethhdr *)skb_mac_header(skb); } #define VLAN_PRIO_MASK 0xe000 /* Priority Code Point */ #define VLAN_PRIO_SHIFT 13 #define VLAN_CFI_MASK 0x1000 /* Canonical Format Indicator / Drop Eligible Indicator */ #define VLAN_VID_MASK 0x0fff /* VLAN Identifier */ #define VLAN_N_VID 4096 /* found in socket.c */ extern void vlan_ioctl_set(int (*hook)(struct net *, void __user *)); static inline bool is_vlan_dev(const struct net_device *dev) { return dev->priv_flags & IFF_802_1Q_VLAN; } #define skb_vlan_tag_present(__skb) ((__skb)->vlan_present) #define skb_vlan_tag_get(__skb) ((__skb)->vlan_tci) #define skb_vlan_tag_get_id(__skb) ((__skb)->vlan_tci & VLAN_VID_MASK) #define skb_vlan_tag_get_cfi(__skb) (!!((__skb)->vlan_tci & VLAN_CFI_MASK)) #define skb_vlan_tag_get_prio(__skb) (((__skb)->vlan_tci & VLAN_PRIO_MASK) >> VLAN_PRIO_SHIFT) static inline int vlan_get_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_CVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_ctag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_CVLAN_FILTER_DROP_INFO, dev); } static inline int vlan_get_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); return notifier_to_errno(call_netdevice_notifiers(NETDEV_SVLAN_FILTER_PUSH_INFO, dev)); } static inline void vlan_drop_rx_stag_filter_info(struct net_device *dev) { ASSERT_RTNL(); call_netdevice_notifiers(NETDEV_SVLAN_FILTER_DROP_INFO, dev); } /** * struct vlan_pcpu_stats - VLAN percpu rx/tx stats * @rx_packets: number of received packets * @rx_bytes: number of received bytes * @rx_multicast: number of received multicast packets * @tx_packets: number of transmitted packets * @tx_bytes: number of transmitted bytes * @syncp: synchronization point for 64bit counters * @rx_errors: number of rx errors * @tx_dropped: number of tx drops */ struct vlan_pcpu_stats { u64 rx_packets; u64 rx_bytes; u64 rx_multicast; u64 tx_packets; u64 tx_bytes; struct u64_stats_sync syncp; u32 rx_errors; u32 tx_dropped; }; #if defined(CONFIG_VLAN_8021Q) || defined(CONFIG_VLAN_8021Q_MODULE) extern struct net_device *__vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id); extern int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg); extern struct net_device *vlan_dev_real_dev(const struct net_device *dev); extern u16 vlan_dev_vlan_id(const struct net_device *dev); extern __be16 vlan_dev_vlan_proto(const struct net_device *dev); /** * struct vlan_priority_tci_mapping - vlan egress priority mappings * @priority: skb priority * @vlan_qos: vlan priority: (skb->priority << 13) & 0xE000 * @next: pointer to next struct */ struct vlan_priority_tci_mapping { u32 priority; u16 vlan_qos; struct vlan_priority_tci_mapping *next; }; struct proc_dir_entry; struct netpoll; /** * struct vlan_dev_priv - VLAN private device data * @nr_ingress_mappings: number of ingress priority mappings * @ingress_priority_map: ingress priority mappings * @nr_egress_mappings: number of egress priority mappings * @egress_priority_map: hash of egress priority mappings * @vlan_proto: VLAN encapsulation protocol * @vlan_id: VLAN identifier * @flags: device flags * @real_dev: underlying netdevice * @real_dev_addr: address of underlying netdevice * @dent: proc dir entry * @vlan_pcpu_stats: ptr to percpu rx stats */ struct vlan_dev_priv { unsigned int nr_ingress_mappings; u32 ingress_priority_map[8]; unsigned int nr_egress_mappings; struct vlan_priority_tci_mapping *egress_priority_map[16]; __be16 vlan_proto; u16 vlan_id; u16 flags; struct net_device *real_dev; unsigned char real_dev_addr[ETH_ALEN]; struct proc_dir_entry *dent; struct vlan_pcpu_stats __percpu *vlan_pcpu_stats; #ifdef CONFIG_NET_POLL_CONTROLLER struct netpoll *netpoll; #endif }; static inline struct vlan_dev_priv *vlan_dev_priv(const struct net_device *dev) { return netdev_priv(dev); } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { struct vlan_priority_tci_mapping *mp; smp_rmb(); /* coupled with smp_wmb() in vlan_dev_set_egress_priority() */ mp = vlan_dev_priv(dev)->egress_priority_map[(skprio & 0xF)]; while (mp) { if (mp->priority == skprio) { return mp->vlan_qos; /* This should already be shifted * to mask correctly with the * VLAN's TCI */ } mp = mp->next; } return 0; } extern bool vlan_do_receive(struct sk_buff **skb); extern int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid); extern void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid); extern int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev); extern void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev); extern bool vlan_uses_dev(const struct net_device *dev); #else static inline struct net_device * __vlan_find_dev_deep_rcu(struct net_device *real_dev, __be16 vlan_proto, u16 vlan_id) { return NULL; } static inline int vlan_for_each(struct net_device *dev, int (*action)(struct net_device *dev, int vid, void *arg), void *arg) { return 0; } static inline struct net_device *vlan_dev_real_dev(const struct net_device *dev) { BUG(); return NULL; } static inline u16 vlan_dev_vlan_id(const struct net_device *dev) { BUG(); return 0; } static inline __be16 vlan_dev_vlan_proto(const struct net_device *dev) { BUG(); return 0; } static inline u16 vlan_dev_get_egress_qos_mask(struct net_device *dev, u32 skprio) { return 0; } static inline bool vlan_do_receive(struct sk_buff **skb) { return false; } static inline int vlan_vid_add(struct net_device *dev, __be16 proto, u16 vid) { return 0; } static inline void vlan_vid_del(struct net_device *dev, __be16 proto, u16 vid) { } static inline int vlan_vids_add_by_dev(struct net_device *dev, const struct net_device *by_dev) { return 0; } static inline void vlan_vids_del_by_dev(struct net_device *dev, const struct net_device *by_dev) { } static inline bool vlan_uses_dev(const struct net_device *dev) { return false; } #endif /** * eth_type_vlan - check for valid vlan ether type. * @ethertype: ether type to check * * Returns true if the ether type is a vlan ether type. */ static inline bool eth_type_vlan(__be16 ethertype) { switch (ethertype) { case htons(ETH_P_8021Q): case htons(ETH_P_8021AD): return true; default: return false; } } static inline bool vlan_hw_offload_capable(netdev_features_t features, __be16 proto) { if (proto == htons(ETH_P_8021Q) && features & NETIF_F_HW_VLAN_CTAG_TX) return true; if (proto == htons(ETH_P_8021AD) && features & NETIF_F_HW_VLAN_STAG_TX) return true; return false; } /** * __vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns error if skb_cow_head fails. * * Does not change skb->protocol so this function can be used during receive. */ static inline int __vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { struct vlan_ethhdr *veth; if (skb_cow_head(skb, VLAN_HLEN) < 0) return -ENOMEM; skb_push(skb, VLAN_HLEN); /* Move the mac header sans proto to the beginning of the new header. */ if (likely(mac_len > ETH_TLEN)) memmove(skb->data, skb->data + VLAN_HLEN, mac_len - ETH_TLEN); skb->mac_header -= VLAN_HLEN; veth = (struct vlan_ethhdr *)(skb->data + mac_len - ETH_HLEN); /* first, the ethernet type */ if (likely(mac_len >= ETH_TLEN)) { /* h_vlan_encapsulated_proto should already be populated, and * skb->data has space for h_vlan_proto */ veth->h_vlan_proto = vlan_proto; } else { /* h_vlan_encapsulated_proto should not be populated, and * skb->data has no space for h_vlan_proto */ veth->h_vlan_encapsulated_proto = skb->protocol; } /* now, the TCI */ veth->h_vlan_TCI = htons(vlan_tci); return 0; } /** * __vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns error if skb_cow_head fails. * * Does not change skb->protocol so this function can be used during receive. */ static inline int __vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_inner_tag - inner VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * @mac_len: MAC header length including outer vlan headers * * Inserts the VLAN tag into @skb as part of the payload at offset mac_len * Returns a VLAN tagged skb. If a new skb is created, @skb is freed. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. */ static inline struct sk_buff *vlan_insert_inner_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci, unsigned int mac_len) { int err; err = __vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, mac_len); if (err) { dev_kfree_skb_any(skb); return NULL; } return skb; } /** * vlan_insert_tag - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. If a new skb is created, @skb is freed. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. * * Does not change skb->protocol so this function can be used during receive. */ static inline struct sk_buff *vlan_insert_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { return vlan_insert_inner_tag(skb, vlan_proto, vlan_tci, ETH_HLEN); } /** * vlan_insert_tag_set_proto - regular VLAN tag inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Inserts the VLAN tag into @skb as part of the payload * Returns a VLAN tagged skb. If a new skb is created, @skb is freed. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. */ static inline struct sk_buff *vlan_insert_tag_set_proto(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb = vlan_insert_tag(skb, vlan_proto, vlan_tci); if (skb) skb->protocol = vlan_proto; return skb; } /** * __vlan_hwaccel_clear_tag - clear hardware accelerated VLAN info * @skb: skbuff to clear * * Clears the VLAN information from @skb */ static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb) { skb->vlan_present = 0; } /** * __vlan_hwaccel_copy_tag - copy hardware accelerated VLAN info from another skb * @dst: skbuff to copy to * @src: skbuff to copy from * * Copies VLAN information from @src to @dst (for branchless code) */ static inline void __vlan_hwaccel_copy_tag(struct sk_buff *dst, const struct sk_buff *src) { dst->vlan_present = src->vlan_present; dst->vlan_proto = src->vlan_proto; dst->vlan_tci = src->vlan_tci; } /* * __vlan_hwaccel_push_inside - pushes vlan tag to the payload * @skb: skbuff to tag * * Pushes the VLAN tag from @skb->vlan_tci inside to the payload. * * Following the skb_unshare() example, in case of error, the calling function * doesn't have to worry about freeing the original skb. */ static inline struct sk_buff *__vlan_hwaccel_push_inside(struct sk_buff *skb) { skb = vlan_insert_tag_set_proto(skb, skb->vlan_proto, skb_vlan_tag_get(skb)); if (likely(skb)) __vlan_hwaccel_clear_tag(skb); return skb; } /** * __vlan_hwaccel_put_tag - hardware accelerated VLAN inserting * @skb: skbuff to tag * @vlan_proto: VLAN encapsulation protocol * @vlan_tci: VLAN TCI to insert * * Puts the VLAN TCI in @skb->vlan_tci and lets the device do the rest */ static inline void __vlan_hwaccel_put_tag(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) { skb->vlan_proto = vlan_proto; skb->vlan_tci = vlan_tci; skb->vlan_present = 1; } /** * __vlan_get_tag - get the VLAN ID that is part of the payload * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns error if the skb is not of VLAN type */ static inline int __vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { struct vlan_ethhdr *veth = (struct vlan_ethhdr *)skb->data; if (!eth_type_vlan(veth->h_vlan_proto)) return -EINVAL; *vlan_tci = ntohs(veth->h_vlan_TCI); return 0; } /** * __vlan_hwaccel_get_tag - get the VLAN ID that is in @skb->cb[] * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns error if @skb->vlan_tci is not set correctly */ static inline int __vlan_hwaccel_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb_vlan_tag_present(skb)) { *vlan_tci = skb_vlan_tag_get(skb); return 0; } else { *vlan_tci = 0; return -EINVAL; } } /** * vlan_get_tag - get the VLAN ID from the skb * @skb: skbuff to query * @vlan_tci: buffer to store value * * Returns error if the skb is not VLAN tagged */ static inline int vlan_get_tag(const struct sk_buff *skb, u16 *vlan_tci) { if (skb->dev->features & NETIF_F_HW_VLAN_CTAG_TX) { return __vlan_hwaccel_get_tag(skb, vlan_tci); } else { return __vlan_get_tag(skb, vlan_tci); } } /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * @type: first vlan protocol * @depth: buffer to store length of eth and vlan tags in bytes * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 __vlan_get_protocol(const struct sk_buff *skb, __be16 type, int *depth) { unsigned int vlan_depth = skb->mac_len, parse_depth = VLAN_MAX_DEPTH; /* if type is 802.1Q/AD then the header should already be * present at mac_len - VLAN_HLEN (if mac_len > 0), or at * ETH_HLEN otherwise */ if (eth_type_vlan(type)) { if (vlan_depth) { if (WARN_ON(vlan_depth < VLAN_HLEN)) return 0; vlan_depth -= VLAN_HLEN; } else { vlan_depth = ETH_HLEN; } do { struct vlan_hdr vhdr, *vh; vh = skb_header_pointer(skb, vlan_depth, sizeof(vhdr), &vhdr); if (unlikely(!vh || !--parse_depth)) return 0; type = vh->h_vlan_encapsulated_proto; vlan_depth += VLAN_HLEN; } while (eth_type_vlan(type)); } if (depth) *depth = vlan_depth; return type; } /** * vlan_get_protocol - get protocol EtherType. * @skb: skbuff to query * * Returns the EtherType of the packet, regardless of whether it is * vlan encapsulated (normal or hardware accelerated) or not. */ static inline __be16 vlan_get_protocol(const struct sk_buff *skb) { return __vlan_get_protocol(skb, skb->protocol, NULL); } /* A getter for the SKB protocol field which will handle VLAN tags consistently * whether VLAN acceleration is enabled or not. */ static inline __be16 skb_protocol(const struct sk_buff *skb, bool skip_vlan) { if (!skip_vlan) /* VLAN acceleration strips the VLAN header from the skb and * moves it to skb->vlan_proto */ return skb_vlan_tag_present(skb) ? skb->vlan_proto : skb->protocol; return vlan_get_protocol(skb); } static inline void vlan_set_encap_proto(struct sk_buff *skb, struct vlan_hdr *vhdr) { __be16 proto; unsigned short *rawp; /* * Was a VLAN packet, grab the encapsulated protocol, which the layer * three protocols care about. */ proto = vhdr->h_vlan_encapsulated_proto; if (eth_proto_is_802_3(proto)) { skb->protocol = proto; return; } rawp = (unsigned short *)(vhdr + 1); if (*rawp == 0xFFFF) /* * This is a magic hack to spot IPX packets. Older Novell * breaks the protocol design and runs IPX over 802.3 without * an 802.2 LLC layer. We look for FFFF which isn't a used * 802.2 SSAP/DSAP. This won't work for fault tolerant netware * but does for the rest. */ skb->protocol = htons(ETH_P_802_3); else /* * Real 802.2 LLC */ skb->protocol = htons(ETH_P_802_2); } /** * skb_vlan_tagged - check if skb is vlan tagged. * @skb: skbuff to query * * Returns true if the skb is tagged, regardless of whether it is hardware * accelerated or not. */ static inline bool skb_vlan_tagged(const struct sk_buff *skb) { if (!skb_vlan_tag_present(skb) && likely(!eth_type_vlan(skb->protocol))) return false; return true; } /** * skb_vlan_tagged_multi - check if skb is vlan tagged with multiple headers. * @skb: skbuff to query * * Returns true if the skb is tagged with multiple vlan headers, regardless * of whether it is hardware accelerated or not. */ static inline bool skb_vlan_tagged_multi(struct sk_buff *skb) { __be16 protocol = skb->protocol; if (!skb_vlan_tag_present(skb)) { struct vlan_ethhdr *veh; if (likely(!eth_type_vlan(protocol))) return false; if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) return false; veh = (struct vlan_ethhdr *)skb->data; protocol = veh->h_vlan_encapsulated_proto; } if (!eth_type_vlan(protocol)) return false; return true; } /** * vlan_features_check - drop unsafe features for skb with multiple tags. * @skb: skbuff to query * @features: features to be checked * * Returns features without unsafe ones if the skb has multiple tags. */ static inline netdev_features_t vlan_features_check(struct sk_buff *skb, netdev_features_t features) { if (skb_vlan_tagged_multi(skb)) { /* In the case of multi-tagged packets, use a direct mask * instead of using netdev_interesect_features(), to make * sure that only devices supporting NETIF_F_HW_CSUM will * have checksum offloading support. */ features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_HW_CSUM | NETIF_F_FRAGLIST | NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX; } return features; } /** * compare_vlan_header - Compare two vlan headers * @h1: Pointer to vlan header * @h2: Pointer to vlan header * * Compare two vlan headers, returns 0 if equal. * * Please note that alignment of h1 & h2 are only guaranteed to be 16 bits. */ static inline unsigned long compare_vlan_header(const struct vlan_hdr *h1, const struct vlan_hdr *h2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) return *(u32 *)h1 ^ *(u32 *)h2; #else return ((__force u32)h1->h_vlan_TCI ^ (__force u32)h2->h_vlan_TCI) | ((__force u32)h1->h_vlan_encapsulated_proto ^ (__force u32)h2->h_vlan_encapsulated_proto); #endif } #endif /* !(_LINUX_IF_VLAN_H_) */
1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MM_H #define _LINUX_MM_H #include <linux/errno.h> #ifdef __KERNEL__ #include <linux/mmdebug.h> #include <linux/gfp.h> #include <linux/bug.h> #include <linux/list.h> #include <linux/mmzone.h> #include <linux/rbtree.h> #include <linux/atomic.h> #include <linux/debug_locks.h> #include <linux/mm_types.h> #include <linux/mmap_lock.h> #include <linux/range.h> #include <linux/pfn.h> #include <linux/percpu-refcount.h> #include <linux/bit_spinlock.h> #include <linux/shrinker.h> #include <linux/resource.h> #include <linux/page_ext.h> #include <linux/err.h> #include <linux/page-flags.h> #include <linux/page_ref.h> #include <linux/memremap.h> #include <linux/overflow.h> #include <linux/sizes.h> #include <linux/sched.h> #include <linux/pgtable.h> struct mempolicy; struct anon_vma; struct anon_vma_chain; struct file_ra_state; struct user_struct; struct writeback_control; struct bdi_writeback; struct pt_regs; extern int sysctl_page_lock_unfairness; void init_mm_internals(void); #ifndef CONFIG_NEED_MULTIPLE_NODES /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; static inline void set_max_mapnr(unsigned long limit) { max_mapnr = limit; } #else static inline void set_max_mapnr(unsigned long limit) { } #endif extern atomic_long_t _totalram_pages; static inline unsigned long totalram_pages(void) { return (unsigned long)atomic_long_read(&_totalram_pages); } static inline void totalram_pages_inc(void) { atomic_long_inc(&_totalram_pages); } static inline void totalram_pages_dec(void) { atomic_long_dec(&_totalram_pages); } static inline void totalram_pages_add(long count) { atomic_long_add(count, &_totalram_pages); } extern void * high_memory; extern int page_cluster; #ifdef CONFIG_SYSCTL extern int sysctl_legacy_va_layout; #else #define sysctl_legacy_va_layout 0 #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS extern const int mmap_rnd_bits_min; extern const int mmap_rnd_bits_max; extern int mmap_rnd_bits __read_mostly; #endif #ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS extern const int mmap_rnd_compat_bits_min; extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif #include <asm/page.h> #include <asm/processor.h> /* * Architectures that support memory tagging (assigning tags to memory regions, * embedding these tags into addresses that point to these memory regions, and * checking that the memory and the pointer tags match on memory accesses) * redefine this macro to strip tags from pointers. * It's defined as noop for arcitectures that don't support memory tagging. */ #ifndef untagged_addr #define untagged_addr(addr) (addr) #endif #ifndef __pa_symbol #define __pa_symbol(x) __pa(RELOC_HIDE((unsigned long)(x), 0)) #endif #ifndef page_to_virt #define page_to_virt(x) __va(PFN_PHYS(page_to_pfn(x))) #endif #ifndef lm_alias #define lm_alias(x) __va(__pa_symbol(x)) #endif /* * To prevent common memory management code establishing * a zero page mapping on a read fault. * This macro should be defined within <asm/pgtable.h>. * s390 does this to prevent multiplexing of hardware bits * related to the physical page in case of virtualization. */ #ifndef mm_forbids_zeropage #define mm_forbids_zeropage(X) (0) #endif /* * On some architectures it is expensive to call memset() for small sizes. * If an architecture decides to implement their own version of * mm_zero_struct_page they should wrap the defines below in a #ifndef and * define their own version of this macro in <asm/pgtable.h> */ #if BITS_PER_LONG == 64 /* This function must be updated when the size of struct page grows above 80 * or reduces below 56. The idea that compiler optimizes out switch() * statement, and only leaves move/store instructions. Also the compiler can * combine write statments if they are both assignments and can be reordered, * this can result in several of the writes here being dropped. */ #define mm_zero_struct_page(pp) __mm_zero_struct_page(pp) static inline void __mm_zero_struct_page(struct page *page) { unsigned long *_pp = (void *)page; /* Check that struct page is either 56, 64, 72, or 80 bytes */ BUILD_BUG_ON(sizeof(struct page) & 7); BUILD_BUG_ON(sizeof(struct page) < 56); BUILD_BUG_ON(sizeof(struct page) > 80); switch (sizeof(struct page)) { case 80: _pp[9] = 0; fallthrough; case 72: _pp[8] = 0; fallthrough; case 64: _pp[7] = 0; fallthrough; case 56: _pp[6] = 0; _pp[5] = 0; _pp[4] = 0; _pp[3] = 0; _pp[2] = 0; _pp[1] = 0; _pp[0] = 0; } } #else #define mm_zero_struct_page(pp) ((void)memset((pp), 0, sizeof(struct page))) #endif /* * Default maximum number of active map areas, this limits the number of vmas * per mm struct. Users can overwrite this number by sysctl but there is a * problem. * * When a program's coredump is generated as ELF format, a section is created * per a vma. In ELF, the number of sections is represented in unsigned short. * This means the number of sections should be smaller than 65535 at coredump. * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. We use "5" as safe margin, here. * * ELF extended numbering allows more than 65535 sections, so 16-bit bound is * not a hard limit any more. Although some userspace tools can be surprised by * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) extern int sysctl_max_map_count; extern unsigned long sysctl_user_reserve_kbytes; extern unsigned long sysctl_admin_reserve_kbytes; extern int sysctl_overcommit_memory; extern int sysctl_overcommit_ratio; extern unsigned long sysctl_overcommit_kbytes; int overcommit_ratio_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_kbytes_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int overcommit_policy_handler(struct ctl_table *, int, void *, size_t *, loff_t *); #define nth_page(page,n) pfn_to_page(page_to_pfn((page)) + (n)) /* to align the pointer to the (next) page boundary */ #define PAGE_ALIGN(addr) ALIGN(addr, PAGE_SIZE) /* test whether an address (unsigned long or pointer) is aligned to PAGE_SIZE */ #define PAGE_ALIGNED(addr) IS_ALIGNED((unsigned long)(addr), PAGE_SIZE) #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) /* * Linux kernel virtual memory manager primitives. * The idea being to have a "virtual" mm in the same way * we have a virtual fs - giving a cleaner interface to the * mm details, and allowing different kinds of memory mappings * (from shared memory to executable loading to arbitrary * mmap() functions). */ struct vm_area_struct *vm_area_alloc(struct mm_struct *); struct vm_area_struct *vm_area_dup(struct vm_area_struct *); void vm_area_free(struct vm_area_struct *); #ifndef CONFIG_MMU extern struct rb_root nommu_region_tree; extern struct rw_semaphore nommu_region_sem; extern unsigned int kobjsize(const void *objp); #endif /* * vm_flags in vm_area_struct, see mm_types.h. * When changing, update also include/trace/events/mmflags.h */ #define VM_NONE 0x00000000 #define VM_READ 0x00000001 /* currently active flags */ #define VM_WRITE 0x00000002 #define VM_EXEC 0x00000004 #define VM_SHARED 0x00000008 /* mprotect() hardcodes VM_MAYREAD >> 4 == VM_READ, and so for r/w/x bits. */ #define VM_MAYREAD 0x00000010 /* limits for mprotect() etc */ #define VM_MAYWRITE 0x00000020 #define VM_MAYEXEC 0x00000040 #define VM_MAYSHARE 0x00000080 #define VM_GROWSDOWN 0x00000100 /* general info on the segment */ #define VM_UFFD_MISSING 0x00000200 /* missing pages tracking */ #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ #define VM_UFFD_WP 0x00001000 /* wrprotect pages tracking */ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 /* Memory mapped I/O or similar */ /* Used by sys_madvise() */ #define VM_SEQ_READ 0x00008000 /* App will access data sequentially */ #define VM_RAND_READ 0x00010000 /* App will not benefit from clustered reads */ #define VM_DONTCOPY 0x00020000 /* Do not copy this vma on fork */ #define VM_DONTEXPAND 0x00040000 /* Cannot expand with mremap() */ #define VM_LOCKONFAULT 0x00080000 /* Lock the pages covered when they are faulted in */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_SYNC 0x00800000 /* Synchronous page faults */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #ifdef CONFIG_MEM_SOFT_DIRTY # define VM_SOFTDIRTY 0x08000000 /* Not soft dirty clean area */ #else # define VM_SOFTDIRTY 0 #endif #define VM_MIXEDMAP 0x10000000 /* Can contain "struct page" and pure PFN pages */ #define VM_HUGEPAGE 0x20000000 /* MADV_HUGEPAGE marked this vma */ #define VM_NOHUGEPAGE 0x40000000 /* MADV_NOHUGEPAGE marked this vma */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ #ifdef CONFIG_ARCH_USES_HIGH_VMA_FLAGS #define VM_HIGH_ARCH_BIT_0 32 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_1 33 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_2 34 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_3 35 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_BIT_4 36 /* bit only usable on 64-bit architectures */ #define VM_HIGH_ARCH_0 BIT(VM_HIGH_ARCH_BIT_0) #define VM_HIGH_ARCH_1 BIT(VM_HIGH_ARCH_BIT_1) #define VM_HIGH_ARCH_2 BIT(VM_HIGH_ARCH_BIT_2) #define VM_HIGH_ARCH_3 BIT(VM_HIGH_ARCH_BIT_3) #define VM_HIGH_ARCH_4 BIT(VM_HIGH_ARCH_BIT_4) #endif /* CONFIG_ARCH_USES_HIGH_VMA_FLAGS */ #ifdef CONFIG_ARCH_HAS_PKEYS # define VM_PKEY_SHIFT VM_HIGH_ARCH_BIT_0 # define VM_PKEY_BIT0 VM_HIGH_ARCH_0 /* A protection key is a 4-bit value */ # define VM_PKEY_BIT1 VM_HIGH_ARCH_1 /* on x86 and 5-bit value on ppc64 */ # define VM_PKEY_BIT2 VM_HIGH_ARCH_2 # define VM_PKEY_BIT3 VM_HIGH_ARCH_3 #ifdef CONFIG_PPC # define VM_PKEY_BIT4 VM_HIGH_ARCH_4 #else # define VM_PKEY_BIT4 0 #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ #if defined(CONFIG_X86) # define VM_PAT VM_ARCH_1 /* PAT reserves whole VMA at once (x86) */ #elif defined(CONFIG_PPC) # define VM_SAO VM_ARCH_1 /* Strong Access Ordering (powerpc) */ #elif defined(CONFIG_PARISC) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_IA64) # define VM_GROWSUP VM_ARCH_1 #elif defined(CONFIG_SPARC64) # define VM_SPARC_ADI VM_ARCH_1 /* Uses ADI tag for access control */ # define VM_ARCH_CLEAR VM_SPARC_ADI #elif defined(CONFIG_ARM64) # define VM_ARM64_BTI VM_ARCH_1 /* BTI guarded page, a.k.a. GP bit */ # define VM_ARCH_CLEAR VM_ARM64_BTI #elif !defined(CONFIG_MMU) # define VM_MAPPED_COPY VM_ARCH_1 /* T if mapped copy of data (nommu mmap) */ #endif #if defined(CONFIG_ARM64_MTE) # define VM_MTE VM_HIGH_ARCH_0 /* Use Tagged memory for access control */ # define VM_MTE_ALLOWED VM_HIGH_ARCH_1 /* Tagged memory permitted */ #else # define VM_MTE VM_NONE # define VM_MTE_ALLOWED VM_NONE #endif #ifndef VM_GROWSUP # define VM_GROWSUP VM_NONE #endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) #define TASK_EXEC ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0) /* Common data flag combinations */ #define VM_DATA_FLAGS_TSK_EXEC (VM_READ | VM_WRITE | TASK_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_NON_EXEC (VM_READ | VM_WRITE | VM_MAYREAD | \ VM_MAYWRITE | VM_MAYEXEC) #define VM_DATA_FLAGS_EXEC (VM_READ | VM_WRITE | VM_EXEC | \ VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) #ifndef VM_DATA_DEFAULT_FLAGS /* arch can override this */ #define VM_DATA_DEFAULT_FLAGS VM_DATA_FLAGS_EXEC #endif #ifndef VM_STACK_DEFAULT_FLAGS /* arch can override this */ #define VM_STACK_DEFAULT_FLAGS VM_DATA_DEFAULT_FLAGS #endif #ifdef CONFIG_STACK_GROWSUP #define VM_STACK VM_GROWSUP #else #define VM_STACK VM_GROWSDOWN #endif #define VM_STACK_FLAGS (VM_STACK | VM_STACK_DEFAULT_FLAGS | VM_ACCOUNT) /* VMA basic access permission flags */ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) /* * Special vmas that are non-mergable, non-mlock()able. */ #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) /* This mask prevents VMA from being scanned with khugepaged */ #define VM_NO_KHUGEPAGED (VM_SPECIAL | VM_HUGETLB) /* This mask defines which mm->def_flags a process can inherit its parent */ #define VM_INIT_DEF_MASK VM_NOHUGEPAGE /* This mask is used to clear all the VMA flags used by mlock */ #define VM_LOCKED_CLEAR_MASK (~(VM_LOCKED | VM_LOCKONFAULT)) /* Arch-specific flags to clear when updating VM flags on protection change */ #ifndef VM_ARCH_CLEAR # define VM_ARCH_CLEAR VM_NONE #endif #define VM_FLAGS_CLEAR (ARCH_VM_PKEY_FLAGS | VM_ARCH_CLEAR) /* * mapping from the currently active vm_flags protection bits (the * low four bits) to a page protection mask.. */ extern pgprot_t protection_map[16]; /** * Fault flag definitions. * * @FAULT_FLAG_WRITE: Fault was a write fault. * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. * @FAULT_FLAG_TRIED: The fault has been tried once. * @FAULT_FLAG_USER: The fault originated in userspace. * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. * * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify * whether we would allow page faults to retry by specifying these two * fault flags correctly. Currently there can be three legal combinations: * * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and * this is the first try * * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and * we've already tried at least once * * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry * * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never * be used. Note that page faults can be allowed to retry for multiple times, * in which case we'll have an initial fault with flags (a) then later on * continuous faults with flags (b). We should always try to detect pending * signals before a retry to make sure the continuous page faults can still be * interrupted if necessary. */ #define FAULT_FLAG_WRITE 0x01 #define FAULT_FLAG_MKWRITE 0x02 #define FAULT_FLAG_ALLOW_RETRY 0x04 #define FAULT_FLAG_RETRY_NOWAIT 0x08 #define FAULT_FLAG_KILLABLE 0x10 #define FAULT_FLAG_TRIED 0x20 #define FAULT_FLAG_USER 0x40 #define FAULT_FLAG_REMOTE 0x80 #define FAULT_FLAG_INSTRUCTION 0x100 #define FAULT_FLAG_INTERRUPTIBLE 0x200 /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. */ #define FAULT_FLAG_DEFAULT (FAULT_FLAG_ALLOW_RETRY | \ FAULT_FLAG_KILLABLE | \ FAULT_FLAG_INTERRUPTIBLE) /** * fault_flag_allow_retry_first - check ALLOW_RETRY the first time * * This is mostly used for places where we want to try to avoid taking * the mmap_lock for too long a time when waiting for another condition * to change, in which case we can try to be polite to release the * mmap_lock in the first round to avoid potential starvation of other * processes that would also want the mmap_lock. * * Return: true if the page fault allows retry and this is the first * attempt of the fault handling; false otherwise. */ static inline bool fault_flag_allow_retry_first(unsigned int flags) { return (flags & FAULT_FLAG_ALLOW_RETRY) && (!(flags & FAULT_FLAG_TRIED)); } #define FAULT_FLAG_TRACE \ { FAULT_FLAG_WRITE, "WRITE" }, \ { FAULT_FLAG_MKWRITE, "MKWRITE" }, \ { FAULT_FLAG_ALLOW_RETRY, "ALLOW_RETRY" }, \ { FAULT_FLAG_RETRY_NOWAIT, "RETRY_NOWAIT" }, \ { FAULT_FLAG_KILLABLE, "KILLABLE" }, \ { FAULT_FLAG_TRIED, "TRIED" }, \ { FAULT_FLAG_USER, "USER" }, \ { FAULT_FLAG_REMOTE, "REMOTE" }, \ { FAULT_FLAG_INSTRUCTION, "INSTRUCTION" }, \ { FAULT_FLAG_INTERRUPTIBLE, "INTERRUPTIBLE" } /* * vm_fault is filled by the pagefault handler and passed to the vma's * ->fault function. The vma's ->fault is responsible for returning a bitmask * of VM_FAULT_xxx flags that give details about how the fault was handled. * * MM layer fills up gfp_mask for page allocations but fault handler might * alter it if its implementation requires a different allocation context. * * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { struct vm_area_struct *vma; /* Target VMA */ unsigned int flags; /* FAULT_FLAG_xxx flags */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ pud_t *pud; /* Pointer to pud entry matching * the 'address' */ pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Page handler may use for COW fault */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by * VM_FAULT_ERROR). */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. */ spinlock_t *ptl; /* Page table lock. * Protects pte page table if 'pte' * is not NULL, otherwise pmd. */ pgtable_t prealloc_pte; /* Pre-allocated pte page table. * vm_ops->map_pages() calls * alloc_set_pte() from atomic context. * do_fault_around() pre-allocates * page table to avoid allocation from * atomic context. */ }; /* page entry size for vm->huge_fault() */ enum page_entry_size { PE_SIZE_PTE = 0, PE_SIZE_PMD, PE_SIZE_PUD, }; /* * These are the virtual MM functions - opening of an area, closing and * unmapping it (needed to keep files on disk up-to-date etc), pointer * to the functions called when a no-page or a wp-page exception occurs. */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); void (*close)(struct vm_area_struct * area); int (*split)(struct vm_area_struct * area, unsigned long addr); int (*mremap)(struct vm_area_struct * area); vm_fault_t (*fault)(struct vm_fault *vmf); vm_fault_t (*huge_fault)(struct vm_fault *vmf, enum page_entry_size pe_size); void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); unsigned long (*pagesize)(struct vm_area_struct * area); /* notification that a previously read-only page is about to become * writable, if an error is returned it will cause a SIGBUS */ vm_fault_t (*page_mkwrite)(struct vm_fault *vmf); /* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */ vm_fault_t (*pfn_mkwrite)(struct vm_fault *vmf); /* called by access_process_vm when get_user_pages() fails, typically * for use by special VMAs that can switch between memory and hardware */ int (*access)(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); /* Called by the /proc/PID/maps code to ask the vma whether it * has a special name. Returning non-NULL will also cause this * vma to be dumped unconditionally. */ const char *(*name)(struct vm_area_struct *vma); #ifdef CONFIG_NUMA /* * set_policy() op must add a reference to any non-NULL @new mempolicy * to hold the policy upon return. Caller should pass NULL @new to * remove a policy and fall back to surrounding context--i.e. do not * install a MPOL_DEFAULT policy, nor the task or system default * mempolicy. */ int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new); /* * get_policy() op must add reference [mpol_get()] to any policy at * (vma,addr) marked as MPOL_SHARED. The shared policy infrastructure * in mm/mempolicy.c will do this automatically. * get_policy() must NOT add a ref if the policy at (vma,addr) is not * marked as MPOL_SHARED. vma policies are protected by the mmap_lock. * If no [shared/vma] mempolicy exists at the addr, get_policy() op * must return NULL--i.e., do not "fallback" to task or system default * policy. */ struct mempolicy *(*get_policy)(struct vm_area_struct *vma, unsigned long addr); #endif /* * Called by vm_normal_page() for special PTEs to find the * page for @addr. This is useful if the default behavior * (using pte_page()) would not find the correct page. */ struct page *(*find_special_page)(struct vm_area_struct *vma, unsigned long addr); }; static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); } static inline void vma_set_anonymous(struct vm_area_struct *vma) { vma->vm_ops = NULL; } static inline bool vma_is_anonymous(struct vm_area_struct *vma) { return !vma->vm_ops; } static inline bool vma_is_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); if (!maybe_stack) return false; if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == VM_STACK_INCOMPLETE_SETUP) return true; return false; } static inline bool vma_is_foreign(struct vm_area_struct *vma) { if (!current->mm) return true; if (current->mm != vma->vm_mm) return true; return false; } static inline bool vma_is_accessible(struct vm_area_struct *vma) { return vma->vm_flags & VM_ACCESS_FLAGS; } #ifdef CONFIG_SHMEM /* * The vma_is_shmem is not inline because it is used only by slow * paths in userfault. */ bool vma_is_shmem(struct vm_area_struct *vma); #else static inline bool vma_is_shmem(struct vm_area_struct *vma) { return false; } #endif int vma_is_stack_for_current(struct vm_area_struct *vma); /* flush_tlb_range() takes a vma, not a mm, and can care about flags */ #define TLB_FLUSH_VMA(mm,flags) { .vm_mm = (mm), .vm_flags = (flags) } struct mmu_gather; struct inode; #include <linux/huge_mm.h> /* * Methods to modify the page usage count. * * What counts for a page usage: * - cache mapping (page->mapping) * - private data (page->private) * - page mapped in a task's page tables, each mapping * is counted separately * * Also, many kernel routines increase the page count before a critical * routine so they can be sure the page doesn't go away from under them. */ /* * Drop a ref, return true if the refcount fell to zero (the page has no users) */ static inline int put_page_testzero(struct page *page) { VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); return page_ref_dec_and_test(page); } /* * Try to grab a ref unless the page has a refcount of zero, return false if * that is the case. * This can be called when MMU is off so it must not access * any of the virtual mappings. */ static inline int get_page_unless_zero(struct page *page) { return page_ref_add_unless(page, 1, 0); } extern int page_is_ram(unsigned long pfn); enum { REGION_INTERSECTS, REGION_DISJOINT, REGION_MIXED, }; int region_intersects(resource_size_t offset, size_t size, unsigned long flags, unsigned long desc); /* Support for virtually mapped pages */ struct page *vmalloc_to_page(const void *addr); unsigned long vmalloc_to_pfn(const void *addr); /* * Determine if an address is within the vmalloc range * * On nommu, vmalloc/vfree wrap through kmalloc/kfree directly, so there * is no special casing required. */ #ifndef is_ioremap_addr #define is_ioremap_addr(x) is_vmalloc_addr(x) #endif #ifdef CONFIG_MMU extern bool is_vmalloc_addr(const void *x); extern int is_vmalloc_or_module_addr(const void *x); #else static inline bool is_vmalloc_addr(const void *x) { return false; } static inline int is_vmalloc_or_module_addr(const void *x) { return 0; } #endif extern void *kvmalloc_node(size_t size, gfp_t flags, int node); static inline void *kvmalloc(size_t size, gfp_t flags) { return kvmalloc_node(size, flags, NUMA_NO_NODE); } static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) { return kvmalloc_node(size, flags | __GFP_ZERO, node); } static inline void *kvzalloc(size_t size, gfp_t flags) { return kvmalloc(size, flags | __GFP_ZERO); } static inline void *kvmalloc_array(size_t n, size_t size, gfp_t flags) { size_t bytes; if (unlikely(check_mul_overflow(n, size, &bytes))) return NULL; return kvmalloc(bytes, flags); } static inline void *kvcalloc(size_t n, size_t size, gfp_t flags) { return kvmalloc_array(n, size, flags | __GFP_ZERO); } extern void kvfree(const void *addr); extern void kvfree_sensitive(const void *addr, size_t len); static inline int head_compound_mapcount(struct page *head) { return atomic_read(compound_mapcount_ptr(head)) + 1; } /* * Mapcount of compound page as a whole, does not include mapped sub-pages. * * Must be called only for compound pages or any their tail sub-pages. */ static inline int compound_mapcount(struct page *page) { VM_BUG_ON_PAGE(!PageCompound(page), page); page = compound_head(page); return head_compound_mapcount(page); } /* * The atomic page->_mapcount, starts from -1: so that transitions * both from it and to it can be tracked, using atomic_inc_and_test * and atomic_add_negative(-1). */ static inline void page_mapcount_reset(struct page *page) { atomic_set(&(page)->_mapcount, -1); } int __page_mapcount(struct page *page); /* * Mapcount of 0-order page; when compound sub-page, includes * compound_mapcount(). * * Result is undefined for pages which cannot be mapped into userspace. * For example SLAB or special types of pages. See function page_has_type(). * They use this place in struct page differently. */ static inline int page_mapcount(struct page *page) { if (unlikely(PageCompound(page))) return __page_mapcount(page); return atomic_read(&page->_mapcount) + 1; } #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); int page_trans_huge_mapcount(struct page *page, int *total_mapcount); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } static inline int page_trans_huge_mapcount(struct page *page, int *total_mapcount) { int mapcount = page_mapcount(page); if (total_mapcount) *total_mapcount = mapcount; return mapcount; } #endif static inline struct page *virt_to_head_page(const void *x) { struct page *page = virt_to_page(x); return compound_head(page); } void __put_page(struct page *page); void put_pages_list(struct list_head *pages); void split_page(struct page *page, unsigned int order); /* * Compound pages have a destructor function. Provide a * prototype for that function and accessor functions. * These are _only_ valid on the head of a compound page. */ typedef void compound_page_dtor(struct page *); /* Keep the enum in sync with compound_page_dtors array in mm/page_alloc.c */ enum compound_dtor_id { NULL_COMPOUND_DTOR, COMPOUND_PAGE_DTOR, #ifdef CONFIG_HUGETLB_PAGE HUGETLB_PAGE_DTOR, #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE TRANSHUGE_PAGE_DTOR, #endif NR_COMPOUND_DTORS, }; extern compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS]; static inline void set_compound_page_dtor(struct page *page, enum compound_dtor_id compound_dtor) { VM_BUG_ON_PAGE(compound_dtor >= NR_COMPOUND_DTORS, page); page[1].compound_dtor = compound_dtor; } static inline void destroy_compound_page(struct page *page) { VM_BUG_ON_PAGE(page[1].compound_dtor >= NR_COMPOUND_DTORS, page); compound_page_dtors[page[1].compound_dtor](page); } static inline unsigned int compound_order(struct page *page) { if (!PageHead(page)) return 0; return page[1].compound_order; } static inline bool hpage_pincount_available(struct page *page) { /* * Can the page->hpage_pinned_refcount field be used? That field is in * the 3rd page of the compound page, so the smallest (2-page) compound * pages cannot support it. */ page = compound_head(page); return PageCompound(page) && compound_order(page) > 1; } static inline int head_compound_pincount(struct page *head) { return atomic_read(compound_pincount_ptr(head)); } static inline int compound_pincount(struct page *page) { VM_BUG_ON_PAGE(!hpage_pincount_available(page), page); page = compound_head(page); return head_compound_pincount(page); } static inline void set_compound_order(struct page *page, unsigned int order) { page[1].compound_order = order; page[1].compound_nr = 1U << order; } /* Returns the number of pages in this potentially compound page. */ static inline unsigned long compound_nr(struct page *page) { if (!PageHead(page)) return 1; return page[1].compound_nr; } /* Returns the number of bytes in this potentially compound page. */ static inline unsigned long page_size(struct page *page) { return PAGE_SIZE << compound_order(page); } /* Returns the number of bits needed for the number of bytes in a page */ static inline unsigned int page_shift(struct page *page) { return PAGE_SHIFT + compound_order(page); } void free_compound_page(struct page *page); #ifdef CONFIG_MMU /* * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when * servicing faults for write access. In the normal case, do always want * pte_mkwrite. But get_user_pages can cause write faults for mappings * that do not have writing enabled, when used by access_process_vm. */ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) { if (likely(vma->vm_flags & VM_WRITE)) pte = pte_mkwrite(pte); return pte; } vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page); vm_fault_t finish_fault(struct vm_fault *vmf); vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf); #endif /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of * zeroes, and text pages of executables and shared libraries have * only one copy in memory, at most, normally. * * For the non-reserved pages, page_count(page) denotes a reference count. * page_count() == 0 means the page is free. page->lru is then used for * freelist management in the buddy allocator. * page_count() > 0 means the page has been allocated. * * Pages are allocated by the slab allocator in order to provide memory * to kmalloc and kmem_cache_alloc. In this case, the management of the * page, and the fields in 'struct page' are the responsibility of mm/slab.c * unless a particular usage is carefully commented. (the responsibility of * freeing the kmalloc memory is the caller's, of course). * * A page may be used by anyone else who does a __get_free_page(). * In this case, page_count still tracks the references, and should only * be used through the normal accessor functions. The top bits of page->flags * and page->virtual store page management information, but all other fields * are unused and could be used privately, carefully. The management of this * page is the responsibility of the one who allocated it, and those who have * subsequently been given references to it. * * The other pages (we may call them "pagecache pages") are completely * managed by the Linux memory manager: I/O, buffers, swapping etc. * The following discussion applies only to them. * * A pagecache page contains an opaque `private' member, which belongs to the * page's address_space. Usually, this is the address of a circular list of * the page's disk buffers. PG_private must be set to tell the VM to call * into the filesystem to release these pages. * * A page may belong to an inode's memory mapping. In this case, page->mapping * is the pointer to the inode, and page->index is the file offset of the page, * in units of PAGE_SIZE. * * If pagecache pages are not associated with an inode, they are said to be * anonymous pages. These may become associated with the swapcache, and in that * case PG_swapcache is set, and page->private is an offset into the swapcache. * * In either case (swapcache or inode backed), the pagecache itself holds one * reference to the page. Setting PG_private should also increment the * refcount. The each user mapping also has a reference to the page. * * The pagecache pages are stored in a per-mapping radix tree, which is * rooted at mapping->i_pages, and indexed by offset. * Where 2.4 and early 2.6 kernels kept dirty/clean pages in per-address_space * lists, we instead now tag pages as dirty/writeback in the radix tree. * * All pagecache pages may be subject to I/O: * - inode pages may need to be read from disk, * - inode pages which have been modified and are MAP_SHARED may need * to be written back to the inode on disk, * - anonymous pages (including MAP_PRIVATE file mappings) which have been * modified may need to be swapped out to swap space and (later) to be read * back into memory. */ /* * The zone field is never updated after free_area_init_core() * sets it, so none of the operations on it need to be atomic. */ /* Page flags: | [SECTION] | [NODE] | ZONE | [LAST_CPUPID] | ... | FLAGS | */ #define SECTIONS_PGOFF ((sizeof(unsigned long)*8) - SECTIONS_WIDTH) #define NODES_PGOFF (SECTIONS_PGOFF - NODES_WIDTH) #define ZONES_PGOFF (NODES_PGOFF - ZONES_WIDTH) #define LAST_CPUPID_PGOFF (ZONES_PGOFF - LAST_CPUPID_WIDTH) #define KASAN_TAG_PGOFF (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH) /* * Define the bit shifts to access each section. For non-existent * sections we define the shift as 0; that plus a 0 mask ensures * the compiler will optimise away reference to them. */ #define SECTIONS_PGSHIFT (SECTIONS_PGOFF * (SECTIONS_WIDTH != 0)) #define NODES_PGSHIFT (NODES_PGOFF * (NODES_WIDTH != 0)) #define ZONES_PGSHIFT (ZONES_PGOFF * (ZONES_WIDTH != 0)) #define LAST_CPUPID_PGSHIFT (LAST_CPUPID_PGOFF * (LAST_CPUPID_WIDTH != 0)) #define KASAN_TAG_PGSHIFT (KASAN_TAG_PGOFF * (KASAN_TAG_WIDTH != 0)) /* NODE:ZONE or SECTION:ZONE is used to ID a zone for the buddy allocator */ #ifdef NODE_NOT_IN_PAGE_FLAGS #define ZONEID_SHIFT (SECTIONS_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((SECTIONS_PGOFF < ZONES_PGOFF)? \ SECTIONS_PGOFF : ZONES_PGOFF) #else #define ZONEID_SHIFT (NODES_SHIFT + ZONES_SHIFT) #define ZONEID_PGOFF ((NODES_PGOFF < ZONES_PGOFF)? \ NODES_PGOFF : ZONES_PGOFF) #endif #define ZONEID_PGSHIFT (ZONEID_PGOFF * (ZONEID_SHIFT != 0)) #define ZONES_MASK ((1UL << ZONES_WIDTH) - 1) #define NODES_MASK ((1UL << NODES_WIDTH) - 1) #define SECTIONS_MASK ((1UL << SECTIONS_WIDTH) - 1) #define LAST_CPUPID_MASK ((1UL << LAST_CPUPID_SHIFT) - 1) #define KASAN_TAG_MASK ((1UL << KASAN_TAG_WIDTH) - 1) #define ZONEID_MASK ((1UL << ZONEID_SHIFT) - 1) static inline enum zone_type page_zonenum(const struct page *page) { ASSERT_EXCLUSIVE_BITS(page->flags, ZONES_MASK << ZONES_PGSHIFT); return (page->flags >> ZONES_PGSHIFT) & ZONES_MASK; } #ifdef CONFIG_ZONE_DEVICE static inline bool is_zone_device_page(const struct page *page) { return page_zonenum(page) == ZONE_DEVICE; } extern void memmap_init_zone_device(struct zone *, unsigned long, unsigned long, struct dev_pagemap *); #else static inline bool is_zone_device_page(const struct page *page) { return false; } #endif #ifdef CONFIG_DEV_PAGEMAP_OPS void free_devmap_managed_page(struct page *page); DECLARE_STATIC_KEY_FALSE(devmap_managed_key); static inline bool page_is_devmap_managed(struct page *page) { if (!static_branch_unlikely(&devmap_managed_key)) return false; if (!is_zone_device_page(page)) return false; switch (page->pgmap->type) { case MEMORY_DEVICE_PRIVATE: case MEMORY_DEVICE_FS_DAX: return true; default: break; } return false; } void put_devmap_managed_page(struct page *page); #else /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool page_is_devmap_managed(struct page *page) { return false; } static inline void put_devmap_managed_page(struct page *page) { } #endif /* CONFIG_DEV_PAGEMAP_OPS */ static inline bool is_device_private_page(const struct page *page) { return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && IS_ENABLED(CONFIG_DEVICE_PRIVATE) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PRIVATE; } static inline bool is_pci_p2pdma_page(const struct page *page) { return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) && IS_ENABLED(CONFIG_PCI_P2PDMA) && is_zone_device_page(page) && page->pgmap->type == MEMORY_DEVICE_PCI_P2PDMA; } /* 127: arbitrary random number, small enough to assemble well */ #define page_ref_zero_or_close_to_overflow(page) \ ((unsigned int) page_ref_count(page) + 127u <= 127u) static inline void get_page(struct page *page) { page = compound_head(page); /* * Getting a normal page or the head of a compound page * requires to already have an elevated page->_refcount. */ VM_BUG_ON_PAGE(page_ref_zero_or_close_to_overflow(page), page); page_ref_inc(page); } bool __must_check try_grab_page(struct page *page, unsigned int flags); static inline __must_check bool try_get_page(struct page *page) { page = compound_head(page); if (WARN_ON_ONCE(page_ref_count(page) <= 0)) return false; page_ref_inc(page); return true; } static inline void put_page(struct page *page) { page = compound_head(page); /* * For devmap managed pages we need to catch refcount transition from * 2 to 1, when refcount reach one it means the page is free and we * need to inform the device driver through callback. See * include/linux/memremap.h and HMM for details. */ if (page_is_devmap_managed(page)) { put_devmap_managed_page(page); return; } if (put_page_testzero(page)) __put_page(page); } /* * GUP_PIN_COUNTING_BIAS, and the associated functions that use it, overload * the page's refcount so that two separate items are tracked: the original page * reference count, and also a new count of how many pin_user_pages() calls were * made against the page. ("gup-pinned" is another term for the latter). * * With this scheme, pin_user_pages() becomes special: such pages are marked as * distinct from normal pages. As such, the unpin_user_page() call (and its * variants) must be used in order to release gup-pinned pages. * * Choice of value: * * By making GUP_PIN_COUNTING_BIAS a power of two, debugging of page reference * counts with respect to pin_user_pages() and unpin_user_page() becomes * simpler, due to the fact that adding an even power of two to the page * refcount has the effect of using only the upper N bits, for the code that * counts up using the bias value. This means that the lower bits are left for * the exclusive use of the original code that increments and decrements by one * (or at least, by much smaller values than the bias value). * * Of course, once the lower bits overflow into the upper bits (and this is * OK, because subtraction recovers the original values), then visual inspection * no longer suffices to directly view the separate counts. However, for normal * applications that don't have huge page reference counts, this won't be an * issue. * * Locking: the lockless algorithm described in page_cache_get_speculative() * and page_cache_gup_pin_speculative() provides safe operation for * get_user_pages and page_mkclean and other calls that race to set up page * table entries. */ #define GUP_PIN_COUNTING_BIAS (1U << 10) void unpin_user_page(struct page *page); void unpin_user_pages_dirty_lock(struct page **pages, unsigned long npages, bool make_dirty); void unpin_user_pages(struct page **pages, unsigned long npages); /** * page_maybe_dma_pinned() - report if a page is pinned for DMA. * * This function checks if a page has been pinned via a call to * pin_user_pages*(). * * For non-huge pages, the return value is partially fuzzy: false is not fuzzy, * because it means "definitely not pinned for DMA", but true means "probably * pinned for DMA, but possibly a false positive due to having at least * GUP_PIN_COUNTING_BIAS worth of normal page references". * * False positives are OK, because: a) it's unlikely for a page to get that many * refcounts, and b) all the callers of this routine are expected to be able to * deal gracefully with a false positive. * * For huge pages, the result will be exactly correct. That's because we have * more tracking data available: the 3rd struct page in the compound page is * used to track the pincount (instead using of the GUP_PIN_COUNTING_BIAS * scheme). * * For more information, please see Documentation/core-api/pin_user_pages.rst. * * @page: pointer to page to be queried. * @Return: True, if it is likely that the page has been "dma-pinned". * False, if the page is definitely not dma-pinned. */ static inline bool page_maybe_dma_pinned(struct page *page) { if (hpage_pincount_available(page)) return compound_pincount(page) > 0; /* * page_ref_count() is signed. If that refcount overflows, then * page_ref_count() returns a negative value, and callers will avoid * further incrementing the refcount. * * Here, for that overflow case, use the signed bit to count a little * bit higher via unsigned math, and thus still get an accurate result. */ return ((unsigned int)page_ref_count(compound_head(page))) >= GUP_PIN_COUNTING_BIAS; } #if defined(CONFIG_SPARSEMEM) && !defined(CONFIG_SPARSEMEM_VMEMMAP) #define SECTION_IN_PAGE_FLAGS #endif /* * The identification function is mainly used by the buddy allocator for * determining if two pages could be buddies. We are not really identifying * the zone since we could be using the section number id if we do not have * node id available in page flags. * We only guarantee that it will return the same value for two combinable * pages in a zone. */ static inline int page_zone_id(struct page *page) { return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } #ifdef NODE_NOT_IN_PAGE_FLAGS extern int page_to_nid(const struct page *page); #else static inline int page_to_nid(const struct page *page) { struct page *p = (struct page *)page; return (PF_POISONED_CHECK(p)->flags >> NODES_PGSHIFT) & NODES_MASK; } #endif #ifdef CONFIG_NUMA_BALANCING static inline int cpu_pid_to_cpupid(int cpu, int pid) { return ((cpu & LAST__CPU_MASK) << LAST__PID_SHIFT) | (pid & LAST__PID_MASK); } static inline int cpupid_to_pid(int cpupid) { return cpupid & LAST__PID_MASK; } static inline int cpupid_to_cpu(int cpupid) { return (cpupid >> LAST__PID_SHIFT) & LAST__CPU_MASK; } static inline int cpupid_to_nid(int cpupid) { return cpu_to_node(cpupid_to_cpu(cpupid)); } static inline bool cpupid_pid_unset(int cpupid) { return cpupid_to_pid(cpupid) == (-1 & LAST__PID_MASK); } static inline bool cpupid_cpu_unset(int cpupid) { return cpupid_to_cpu(cpupid) == (-1 & LAST__CPU_MASK); } static inline bool __cpupid_match_pid(pid_t task_pid, int cpupid) { return (task_pid & LAST__PID_MASK) == cpupid_to_pid(cpupid); } #define cpupid_match_pid(task, cpupid) __cpupid_match_pid(task->pid, cpupid) #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return xchg(&page->_last_cpupid, cpupid & LAST_CPUPID_MASK); } static inline int page_cpupid_last(struct page *page) { return page->_last_cpupid; } static inline void page_cpupid_reset_last(struct page *page) { page->_last_cpupid = -1 & LAST_CPUPID_MASK; } #else static inline int page_cpupid_last(struct page *page) { return (page->flags >> LAST_CPUPID_PGSHIFT) & LAST_CPUPID_MASK; } extern int page_cpupid_xchg_last(struct page *page, int cpupid); static inline void page_cpupid_reset_last(struct page *page) { page->flags |= LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT; } #endif /* LAST_CPUPID_NOT_IN_PAGE_FLAGS */ #else /* !CONFIG_NUMA_BALANCING */ static inline int page_cpupid_xchg_last(struct page *page, int cpupid) { return page_to_nid(page); /* XXX */ } static inline int page_cpupid_last(struct page *page) { return page_to_nid(page); /* XXX */ } static inline int cpupid_to_nid(int cpupid) { return -1; } static inline int cpupid_to_pid(int cpupid) { return -1; } static inline int cpupid_to_cpu(int cpupid) { return -1; } static inline int cpu_pid_to_cpupid(int nid, int pid) { return -1; } static inline bool cpupid_pid_unset(int cpupid) { return true; } static inline void page_cpupid_reset_last(struct page *page) { } static inline bool cpupid_match_pid(struct task_struct *task, int cpupid) { return false; } #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_KASAN_SW_TAGS /* * KASAN per-page tags are stored xor'ed with 0xff. This allows to avoid * setting tags for all pages to native kernel tag value 0xff, as the default * value 0x00 maps to 0xff. */ static inline u8 page_kasan_tag(const struct page *page) { u8 tag; tag = (page->flags >> KASAN_TAG_PGSHIFT) & KASAN_TAG_MASK; tag ^= 0xff; return tag; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { tag ^= 0xff; page->flags &= ~(KASAN_TAG_MASK << KASAN_TAG_PGSHIFT); page->flags |= (tag & KASAN_TAG_MASK) << KASAN_TAG_PGSHIFT; } static inline void page_kasan_tag_reset(struct page *page) { page_kasan_tag_set(page, 0xff); } #else static inline u8 page_kasan_tag(const struct page *page) { return 0xff; } static inline void page_kasan_tag_set(struct page *page, u8 tag) { } static inline void page_kasan_tag_reset(struct page *page) { } #endif static inline struct zone *page_zone(const struct page *page) { return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)]; } static inline pg_data_t *page_pgdat(const struct page *page) { return NODE_DATA(page_to_nid(page)); } #ifdef SECTION_IN_PAGE_FLAGS static inline void set_page_section(struct page *page, unsigned long section) { page->flags &= ~(SECTIONS_MASK << SECTIONS_PGSHIFT); page->flags |= (section & SECTIONS_MASK) << SECTIONS_PGSHIFT; } static inline unsigned long page_to_section(const struct page *page) { return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK; } #endif static inline void set_page_zone(struct page *page, enum zone_type zone) { page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT); page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT; } static inline void set_page_node(struct page *page, unsigned long node) { page->flags &= ~(NODES_MASK << NODES_PGSHIFT); page->flags |= (node & NODES_MASK) << NODES_PGSHIFT; } static inline void set_page_links(struct page *page, enum zone_type zone, unsigned long node, unsigned long pfn) { set_page_zone(page, zone); set_page_node(page, node); #ifdef SECTION_IN_PAGE_FLAGS set_page_section(page, pfn_to_section_nr(pfn)); #endif } #ifdef CONFIG_MEMCG static inline struct mem_cgroup *page_memcg(struct page *page) { return page->mem_cgroup; } static inline struct mem_cgroup *page_memcg_rcu(struct page *page) { WARN_ON_ONCE(!rcu_read_lock_held()); return READ_ONCE(page->mem_cgroup); } #else static inline struct mem_cgroup *page_memcg(struct page *page) { return NULL; } static inline struct mem_cgroup *page_memcg_rcu(struct page *page) { WARN_ON_ONCE(!rcu_read_lock_held()); return NULL; } #endif /* * Some inline functions in vmstat.h depend on page_zone() */ #include <linux/vmstat.h> static __always_inline void *lowmem_page_address(const struct page *page) { return page_to_virt(page); } #if defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) #define HASHED_PAGE_VIRTUAL #endif #if defined(WANT_PAGE_VIRTUAL) static inline void *page_address(const struct page *page) { return page->virtual; } static inline void set_page_address(struct page *page, void *address) { page->virtual = address; } #define page_address_init() do { } while(0) #endif #if defined(HASHED_PAGE_VIRTUAL) void *page_address(const struct page *page); void set_page_address(struct page *page, void *virtual); void page_address_init(void); #endif #if !defined(HASHED_PAGE_VIRTUAL) && !defined(WANT_PAGE_VIRTUAL) #define page_address(page) lowmem_page_address(page) #define set_page_address(page, address) do { } while(0) #define page_address_init() do { } while(0) #endif extern void *page_rmapping(struct page *page); extern struct anon_vma *page_anon_vma(struct page *page); extern struct address_space *page_mapping(struct page *page); extern struct address_space *__page_file_mapping(struct page *); static inline struct address_space *page_file_mapping(struct page *page) { if (unlikely(PageSwapCache(page))) return __page_file_mapping(page); return page->mapping; } extern pgoff_t __page_file_index(struct page *page); /* * Return the pagecache index of the passed page. Regular pagecache pages * use ->index whereas swapcache pages use swp_offset(->private) */ static inline pgoff_t page_index(struct page *page) { if (unlikely(PageSwapCache(page))) return __page_file_index(page); return page->index; } bool page_mapped(struct page *page); struct address_space *page_mapping(struct page *page); struct address_space *page_mapping_file(struct page *page); /* * Return true only if the page has been allocated with * ALLOC_NO_WATERMARKS and the low watermark was not * met implying that the system is under some pressure. */ static inline bool page_is_pfmemalloc(struct page *page) { /* * Page index cannot be this large so this must be * a pfmemalloc page. */ return page->index == -1UL; } /* * Only to be called by the page allocator on a freshly allocated * page. */ static inline void set_page_pfmemalloc(struct page *page) { page->index = -1UL; } static inline void clear_page_pfmemalloc(struct page *page) { page->index = 0; } /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. */ extern void pagefault_out_of_memory(void); #define offset_in_page(p) ((unsigned long)(p) & ~PAGE_MASK) #define offset_in_thp(page, p) ((unsigned long)(p) & (thp_size(page) - 1)) /* * Flags passed to show_mem() and show_free_areas() to suppress output in * various contexts. */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); #ifdef CONFIG_MMU extern bool can_do_mlock(void); #else static inline bool can_do_mlock(void) { return false; } #endif extern int user_shm_lock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct user_struct *); /* * Parameter block passed down to zap_pte_range in exceptional cases. */ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ struct page *single_page; /* Locked page to be unmapped */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte); struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t pmd); void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long start, unsigned long end); struct mmu_notifier_range; void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); int follow_invalidate_pte(struct mm_struct *mm, unsigned long address, struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, unsigned int flags, unsigned long *prot, resource_size_t *phys); int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); void truncate_pagecache_range(struct inode *inode, loff_t offset, loff_t end); int truncate_inode_page(struct address_space *mapping, struct page *page); int generic_error_remove_page(struct address_space *mapping, struct page *page); int invalidate_inode_page(struct page *page); #ifdef CONFIG_MMU extern vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs); extern int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); void unmap_mapping_page(struct page *page); void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags, struct pt_regs *regs) { /* should never happen if there's no MMU */ BUG(); return VM_FAULT_SIGBUS; } static inline int fixup_user_fault(struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked) { /* should never happen if there's no MMU */ BUG(); return -EFAULT; } static inline void unmap_mapping_page(struct page *page) { } static inline void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t nr, bool even_cows) { } static inline void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif static inline void unmap_shared_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen) { unmap_mapping_range(mapping, holebegin, holelen, 0); } extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); long get_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); long pin_user_pages_remote(struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc); int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); /* Container for pinned pfns / pages */ struct frame_vector { unsigned int nr_allocated; /* Number of frames we have space for */ unsigned int nr_frames; /* Number of frames stored in ptrs array */ bool got_ref; /* Did we pin pages by getting page ref? */ bool is_pfns; /* Does array contain pages or pfns? */ void *ptrs[]; /* Array of pinned pfns / pages. Use * pfns_vector_pages() or pfns_vector_pfns() * for access */ }; struct frame_vector *frame_vector_create(unsigned int nr_frames); void frame_vector_destroy(struct frame_vector *vec); int get_vaddr_frames(unsigned long start, unsigned int nr_pfns, unsigned int gup_flags, struct frame_vector *vec); void put_vaddr_frames(struct frame_vector *vec); int frame_vector_to_pages(struct frame_vector *vec); void frame_vector_to_pfns(struct frame_vector *vec); static inline unsigned int frame_vector_count(struct frame_vector *vec) { return vec->nr_frames; } static inline struct page **frame_vector_pages(struct frame_vector *vec) { if (vec->is_pfns) { int err = frame_vector_to_pages(vec); if (err) return ERR_PTR(err); } return (struct page **)(vec->ptrs); } static inline unsigned long *frame_vector_pfns(struct frame_vector *vec) { if (!vec->is_pfns) frame_vector_to_pfns(vec); return (unsigned long *)(vec->ptrs); } struct kvec; int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, struct page **pages); int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); extern void do_invalidatepage(struct page *page, unsigned int offset, unsigned int length); void __set_page_dirty(struct page *, struct address_space *, int warn); int __set_page_dirty_nobuffers(struct page *page); int __set_page_dirty_no_writeback(struct page *page); int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page); void account_page_dirtied(struct page *page, struct address_space *mapping); void account_page_cleaned(struct page *page, struct address_space *mapping, struct bdi_writeback *wb); int set_page_dirty(struct page *page); int set_page_dirty_lock(struct page *page); void __cancel_dirty_page(struct page *page); static inline void cancel_dirty_page(struct page *page) { /* Avoid atomic ops, locking, etc. when not actually needed. */ if (PageDirty(page)) __cancel_dirty_page(page); } int clear_page_dirty_for_io(struct page *page); int get_cmdline(struct task_