1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 /* SPDX-License-Identifier: GPL-2.0 */ /* * * Generic internet FLOW. * */ #ifndef _NET_FLOW_H #define _NET_FLOW_H #include <linux/socket.h> #include <linux/in6.h> #include <linux/atomic.h> #include <net/flow_dissector.h> #include <linux/uidgid.h> /* * ifindex generation is per-net namespace, and loopback is * always the 1st device in ns (see net_dev_init), thus any * loopback device should get ifindex 1 */ #define LOOPBACK_IFINDEX 1 struct flowi_tunnel { __be64 tun_id; }; struct flowi_common { int flowic_oif; int flowic_iif; __u32 flowic_mark; __u8 flowic_tos; __u8 flowic_scope; __u8 flowic_proto; __u8 flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH 0x02 #define FLOWI_FLAG_SKIP_NH_OIF 0x04 __u32 flowic_secid; kuid_t flowic_uid; struct flowi_tunnel flowic_tun_key; __u32 flowic_multipath_hash; }; union flowi_uli { struct { __be16 dport; __be16 sport; } ports; struct { __u8 type; __u8 code; } icmpt; struct { __le16 dport; __le16 sport; } dnports; __be32 spi; __be32 gre_key; struct { __u8 type; } mht; }; struct flowi4 { struct flowi_common __fl_common; #define flowi4_oif __fl_common.flowic_oif #define flowi4_iif __fl_common.flowic_iif #define flowi4_mark __fl_common.flowic_mark #define flowi4_tos __fl_common.flowic_tos #define flowi4_scope __fl_common.flowic_scope #define flowi4_proto __fl_common.flowic_proto #define flowi4_flags __fl_common.flowic_flags #define flowi4_secid __fl_common.flowic_secid #define flowi4_tun_key __fl_common.flowic_tun_key #define flowi4_uid __fl_common.flowic_uid #define flowi4_multipath_hash __fl_common.flowic_multipath_hash /* (saddr,daddr) must be grouped, same order as in IP header */ __be32 saddr; __be32 daddr; union flowi_uli uli; #define fl4_sport uli.ports.sport #define fl4_dport uli.ports.dport #define fl4_icmp_type uli.icmpt.type #define fl4_icmp_code uli.icmpt.code #define fl4_ipsec_spi uli.spi #define fl4_mh_type uli.mht.type #define fl4_gre_key uli.gre_key } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline void flowi4_init_output(struct flowi4 *fl4, int oif, __u32 mark, __u8 tos, __u8 scope, __u8 proto, __u8 flags, __be32 daddr, __be32 saddr, __be16 dport, __be16 sport, kuid_t uid) { fl4->flowi4_oif = oif; fl4->flowi4_iif = LOOPBACK_IFINDEX; fl4->flowi4_mark = mark; fl4->flowi4_tos = tos; fl4->flowi4_scope = scope; fl4->flowi4_proto = proto; fl4->flowi4_flags = flags; fl4->flowi4_secid = 0; fl4->flowi4_tun_key.tun_id = 0; fl4->flowi4_uid = uid; fl4->daddr = daddr; fl4->saddr = saddr; fl4->fl4_dport = dport; fl4->fl4_sport = sport; fl4->flowi4_multipath_hash = 0; } /* Reset some input parameters after previous lookup */ static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos, __be32 daddr, __be32 saddr) { fl4->flowi4_oif = oif; fl4->flowi4_tos = tos; fl4->daddr = daddr; fl4->saddr = saddr; } struct flowi6 { struct flowi_common __fl_common; #define flowi6_oif __fl_common.flowic_oif #define flowi6_iif __fl_common.flowic_iif #define flowi6_mark __fl_common.flowic_mark #define flowi6_scope __fl_common.flowic_scope #define flowi6_proto __fl_common.flowic_proto #define flowi6_flags __fl_common.flowic_flags #define flowi6_secid __fl_common.flowic_secid #define flowi6_tun_key __fl_common.flowic_tun_key #define flowi6_uid __fl_common.flowic_uid struct in6_addr daddr; struct in6_addr saddr; /* Note: flowi6_tos is encoded in flowlabel, too. */ __be32 flowlabel; union flowi_uli uli; #define fl6_sport uli.ports.sport #define fl6_dport uli.ports.dport #define fl6_icmp_type uli.icmpt.type #define fl6_icmp_code uli.icmpt.code #define fl6_ipsec_spi uli.spi #define fl6_mh_type uli.mht.type #define fl6_gre_key uli.gre_key __u32 mp_hash; } __attribute__((__aligned__(BITS_PER_LONG/8))); struct flowidn { struct flowi_common __fl_common; #define flowidn_oif __fl_common.flowic_oif #define flowidn_iif __fl_common.flowic_iif #define flowidn_mark __fl_common.flowic_mark #define flowidn_scope __fl_common.flowic_scope #define flowidn_proto __fl_common.flowic_proto #define flowidn_flags __fl_common.flowic_flags __le16 daddr; __le16 saddr; union flowi_uli uli; #define fld_sport uli.ports.sport #define fld_dport uli.ports.dport } __attribute__((__aligned__(BITS_PER_LONG/8))); struct flowi { union { struct flowi_common __fl_common; struct flowi4 ip4; struct flowi6 ip6; struct flowidn dn; } u; #define flowi_oif u.__fl_common.flowic_oif #define flowi_iif u.__fl_common.flowic_iif #define flowi_mark u.__fl_common.flowic_mark #define flowi_tos u.__fl_common.flowic_tos #define flowi_scope u.__fl_common.flowic_scope #define flowi_proto u.__fl_common.flowic_proto #define flowi_flags u.__fl_common.flowic_flags #define flowi_secid u.__fl_common.flowic_secid #define flowi_tun_key u.__fl_common.flowic_tun_key #define flowi_uid u.__fl_common.flowic_uid } __attribute__((__aligned__(BITS_PER_LONG/8))); static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) { return container_of(fl4, struct flowi, u.ip4); } static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) { return container_of(fl6, struct flowi, u.ip6); } static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) { return container_of(fldn, struct flowi, u.dn); } __u32 __get_hash_from_flowi6(const struct flowi6 *fl6, struct flow_keys *keys); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KDEV_T_H #define _LINUX_KDEV_T_H #include <uapi/linux/kdev_t.h> #define MINORBITS 20 #define MINORMASK ((1U << MINORBITS) - 1) #define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) #define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) #define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) #define print_dev_t(buffer, dev) \ sprintf((buffer), "%u:%u\n", MAJOR(dev), MINOR(dev)) #define format_dev_t(buffer, dev) \ ({ \ sprintf(buffer, "%u:%u", MAJOR(dev), MINOR(dev)); \ buffer; \ }) /* acceptable for old filesystems */ static __always_inline bool old_valid_dev(dev_t dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; } static __always_inline u16 old_encode_dev(dev_t dev) { return (MAJOR(dev) << 8) | MINOR(dev); } static __always_inline dev_t old_decode_dev(u16 val) { return MKDEV((val >> 8) & 255, val & 255); } static __always_inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); unsigned minor = MINOR(dev); return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12); } static __always_inline dev_t new_decode_dev(u32 dev) { unsigned major = (dev & 0xfff00) >> 8; unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00); return MKDEV(major, minor); } static __always_inline u64 huge_encode_dev(dev_t dev) { return new_encode_dev(dev); } static __always_inline dev_t huge_decode_dev(u64 dev) { return new_decode_dev(dev); } static __always_inline int sysv_valid_dev(dev_t dev) { return MAJOR(dev) < (1<<14) && MINOR(dev) < (1<<18); } static __always_inline u32 sysv_encode_dev(dev_t dev) { return MINOR(dev) | (MAJOR(dev) << 18); } static __always_inline unsigned sysv_major(u32 dev) { return (dev >> 18) & 0x3fff; } static __always_inline unsigned sysv_minor(u32 dev) { return dev & 0x3ffff; } #endif
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 // SPDX-License-Identifier: GPL-2.0-or-later /* * xfrm_device.c - IPsec device offloading code. * * Copyright (c) 2015 secunet Security Networks AG * * Author: * Steffen Klassert <steffen.klassert@secunet.com> */ #include <linux/errno.h> #include <linux/module.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <net/dst.h> #include <net/xfrm.h> #include <linux/notifier.h> #ifdef CONFIG_XFRM_OFFLOAD static void __xfrm_transport_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); skb_reset_mac_len(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header -= x->props.header_len; pskb_pull(skb, skb_transport_offset(skb) + x->props.header_len); } static void __xfrm_mode_tunnel_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); pskb_pull(skb, skb->mac_len + x->props.header_len); } static void __xfrm_mode_beet_prep(struct xfrm_state *x, struct sk_buff *skb, unsigned int hsize) { struct xfrm_offload *xo = xfrm_offload(skb); int phlen = 0; if (xo->flags & XFRM_GSO_SEGMENT) skb->transport_header = skb->network_header + hsize; skb_reset_mac_len(skb); if (x->sel.family != AF_INET6) { phlen = IPV4_BEET_PHMAXLEN; if (x->outer_mode.family == AF_INET6) phlen += sizeof(struct ipv6hdr) - sizeof(struct iphdr); } pskb_pull(skb, skb->mac_len + hsize + (x->props.header_len - phlen)); } /* Adjust pointers into the packet when IPsec is done at layer2 */ static void xfrm_outer_mode_prep(struct xfrm_state *x, struct sk_buff *skb) { switch (x->outer_mode.encap) { case XFRM_MODE_TUNNEL: if (x->outer_mode.family == AF_INET) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_tunnel_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_TRANSPORT: if (x->outer_mode.family == AF_INET) return __xfrm_transport_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_transport_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_BEET: if (x->outer_mode.family == AF_INET) return __xfrm_mode_beet_prep(x, skb, sizeof(struct iphdr)); if (x->outer_mode.family == AF_INET6) return __xfrm_mode_beet_prep(x, skb, sizeof(struct ipv6hdr)); break; case XFRM_MODE_ROUTEOPTIMIZATION: case XFRM_MODE_IN_TRIGGER: break; } } struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t features, bool *again) { int err; unsigned long flags; struct xfrm_state *x; struct softnet_data *sd; struct sk_buff *skb2, *nskb, *pskb = NULL; netdev_features_t esp_features = features; struct xfrm_offload *xo = xfrm_offload(skb); struct net_device *dev = skb->dev; struct sec_path *sp; if (!xo || (xo->flags & XFRM_XMIT)) return skb; if (!(features & NETIF_F_HW_ESP)) esp_features = features & ~(NETIF_F_SG | NETIF_F_CSUM_MASK); sp = skb_sec_path(skb); x = sp->xvec[sp->len - 1]; if (xo->flags & XFRM_GRO || x->xso.flags & XFRM_OFFLOAD_INBOUND) return skb; /* This skb was already validated on the upper/virtual dev */ if ((x->xso.dev != dev) && (x->xso.real_dev == dev)) return skb; local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); err = !skb_queue_empty(&sd->xfrm_backlog); local_irq_restore(flags); if (err) { *again = true; return skb; } if (skb_is_gso(skb) && unlikely(x->xso.dev != dev)) { struct sk_buff *segs; /* Packet got rerouted, fixup features and segment it. */ esp_features = esp_features & ~(NETIF_F_HW_ESP | NETIF_F_GSO_ESP); segs = skb_gso_segment(skb, esp_features); if (IS_ERR(segs)) { kfree_skb(skb); atomic_long_inc(&dev->tx_dropped); return NULL; } else { consume_skb(skb); skb = segs; } } if (!skb->next) { esp_features |= skb->dev->gso_partial_features; xfrm_outer_mode_prep(x, skb); xo->flags |= XFRM_DEV_RESUME; err = x->type_offload->xmit(x, skb, esp_features); if (err) { if (err == -EINPROGRESS) return NULL; XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); kfree_skb(skb); return NULL; } skb_push(skb, skb->data - skb_mac_header(skb)); return skb; } skb_list_walk_safe(skb, skb2, nskb) { esp_features |= skb->dev->gso_partial_features; skb_mark_not_on_list(skb2); xo = xfrm_offload(skb2); xo->flags |= XFRM_DEV_RESUME; xfrm_outer_mode_prep(x, skb2); err = x->type_offload->xmit(x, skb2, esp_features); if (!err) { skb2->next = nskb; } else if (err != -EINPROGRESS) { XFRM_INC_STATS(xs_net(x), LINUX_MIB_XFRMOUTSTATEPROTOERROR); skb2->next = nskb; kfree_skb_list(skb2); return NULL; } else { if (skb == skb2) skb = nskb; else pskb->next = nskb; continue; } skb_push(skb2, skb2->data - skb_mac_header(skb2)); pskb = skb2; } return skb; } EXPORT_SYMBOL_GPL(validate_xmit_xfrm); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, struct xfrm_user_offload *xuo) { int err; struct dst_entry *dst; struct net_device *dev; struct xfrm_state_offload *xso = &x->xso; xfrm_address_t *saddr; xfrm_address_t *daddr; if (!x->type_offload) return -EINVAL; /* We don't yet support UDP encapsulation and TFC padding. */ if (x->encap || x->tfcpad) return -EINVAL; dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { if (!(xuo->flags & XFRM_OFFLOAD_INBOUND)) { saddr = &x->props.saddr; daddr = &x->id.daddr; } else { saddr = &x->id.daddr; daddr = &x->props.saddr; } dst = __xfrm_dst_lookup(net, 0, 0, saddr, daddr, x->props.family, xfrm_smark_get(0, x)); if (IS_ERR(dst)) return 0; dev = dst->dev; dev_hold(dev); dst_release(dst); } if (!dev->xfrmdev_ops || !dev->xfrmdev_ops->xdo_dev_state_add) { xso->dev = NULL; dev_put(dev); return 0; } if (x->props.flags & XFRM_STATE_ESN && !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { xso->dev = NULL; dev_put(dev); return -EINVAL; } xso->dev = dev; xso->real_dev = dev; xso->num_exthdrs = 1; xso->flags = xuo->flags; err = dev->xfrmdev_ops->xdo_dev_state_add(x); if (err) { xso->num_exthdrs = 0; xso->flags = 0; xso->dev = NULL; xso->real_dev = NULL; dev_put(dev); if (err != -EOPNOTSUPP) return err; } return 0; } EXPORT_SYMBOL_GPL(xfrm_dev_state_add); bool xfrm_dev_offload_ok(struct sk_buff *skb, struct xfrm_state *x) { int mtu; struct dst_entry *dst = skb_dst(skb); struct xfrm_dst *xdst = (struct xfrm_dst *)dst; struct net_device *dev = x->xso.dev; if (!x->type_offload || x->encap) return false; if ((!dev || (dev == xfrm_dst_path(dst)->dev)) && (!xdst->child->xfrm)) { mtu = xfrm_state_mtu(x, xdst->child_mtu_cached); if (skb->len <= mtu) goto ok; if (skb_is_gso(skb) && skb_gso_validate_network_len(skb, mtu)) goto ok; } return false; ok: if (dev && dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_offload_ok) return x->xso.dev->xfrmdev_ops->xdo_dev_offload_ok(skb, x); return true; } EXPORT_SYMBOL_GPL(xfrm_dev_offload_ok); void xfrm_dev_resume(struct sk_buff *skb) { struct net_device *dev = skb->dev; int ret = NETDEV_TX_BUSY; struct netdev_queue *txq; struct softnet_data *sd; unsigned long flags; rcu_read_lock(); txq = netdev_core_pick_tx(dev, skb, NULL); HARD_TX_LOCK(dev, txq, smp_processor_id()); if (!netif_xmit_frozen_or_stopped(txq)) skb = dev_hard_start_xmit(skb, dev, txq, &ret); HARD_TX_UNLOCK(dev, txq); if (!dev_xmit_complete(ret)) { local_irq_save(flags); sd = this_cpu_ptr(&softnet_data); skb_queue_tail(&sd->xfrm_backlog, skb); raise_softirq_irqoff(NET_TX_SOFTIRQ); local_irq_restore(flags); } rcu_read_unlock(); } EXPORT_SYMBOL_GPL(xfrm_dev_resume); void xfrm_dev_backlog(struct softnet_data *sd) { struct sk_buff_head *xfrm_backlog = &sd->xfrm_backlog; struct sk_buff_head list; struct sk_buff *skb; if (skb_queue_empty(xfrm_backlog)) return; __skb_queue_head_init(&list); spin_lock(&xfrm_backlog->lock); skb_queue_splice_init(xfrm_backlog, &list); spin_unlock(&xfrm_backlog->lock); while (!skb_queue_empty(&list)) { skb = __skb_dequeue(&list); xfrm_dev_resume(skb); } } #endif static int xfrm_api_check(struct net_device *dev) { #ifdef CONFIG_XFRM_OFFLOAD if ((dev->features & NETIF_F_HW_ESP_TX_CSUM) && !(dev->features & NETIF_F_HW_ESP)) return NOTIFY_BAD; if ((dev->features & NETIF_F_HW_ESP) && (!(dev->xfrmdev_ops && dev->xfrmdev_ops->xdo_dev_state_add && dev->xfrmdev_ops->xdo_dev_state_delete))) return NOTIFY_BAD; #else if (dev->features & (NETIF_F_HW_ESP | NETIF_F_HW_ESP_TX_CSUM)) return NOTIFY_BAD; #endif return NOTIFY_DONE; } static int xfrm_dev_register(struct net_device *dev) { return xfrm_api_check(dev); } static int xfrm_dev_feat_change(struct net_device *dev) { return xfrm_api_check(dev); } static int xfrm_dev_down(struct net_device *dev) { if (dev->features & NETIF_F_HW_ESP) xfrm_dev_state_flush(dev_net(dev), dev, true); return NOTIFY_DONE; } static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); switch (event) { case NETDEV_REGISTER: return xfrm_dev_register(dev); case NETDEV_FEAT_CHANGE: return xfrm_dev_feat_change(dev); case NETDEV_DOWN: case NETDEV_UNREGISTER: return xfrm_dev_down(dev); } return NOTIFY_DONE; } static struct notifier_block xfrm_dev_notifier = { .notifier_call = xfrm_dev_event, }; void __init xfrm_dev_init(void) { register_netdevice_notifier(&xfrm_dev_notifier); }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ #include <linux/rhashtable-types.h> #include <linux/completion.h> /* Per netns frag queues directory */ struct fqdir { /* sysctls */ long high_thresh; long low_thresh; int timeout; int max_dist; struct inet_frags *f; struct net *net; bool dead; struct rhashtable rhashtable ____cacheline_aligned_in_smp; /* Keep atomic mem on separate cachelines in structs that include it */ atomic_long_t mem ____cacheline_aligned_in_smp; struct work_struct destroy_work; }; /** * fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived * @INET_FRAG_COMPLETE: frag queue has been processed and is due for destruction * @INET_FRAG_HASH_DEAD: inet_frag_kill() has not removed fq from rhashtable */ enum { INET_FRAG_FIRST_IN = BIT(0), INET_FRAG_LAST_IN = BIT(1), INET_FRAG_COMPLETE = BIT(2), INET_FRAG_HASH_DEAD = BIT(3), }; struct frag_v4_compare_key { __be32 saddr; __be32 daddr; u32 user; u32 vif; __be16 id; u16 protocol; }; struct frag_v6_compare_key { struct in6_addr saddr; struct in6_addr daddr; u32 user; __be32 id; u32 iif; }; /** * struct inet_frag_queue - fragment queue * * @node: rhash node * @key: keys identifying this frag. * @timer: queue expiration timer * @lock: spinlock protecting this frag * @refcnt: reference count of the queue * @rb_fragments: received fragments rb-tree root * @fragments_tail: received fragments tail * @last_run_head: the head of the last "run". see ip_fragment.c * @stamp: timestamp of the last received fragment * @len: total length of the original datagram * @meat: length of received fragments so far * @flags: fragment queue flags * @max_size: maximum received fragment size * @fqdir: pointer to struct fqdir * @rcu: rcu head for freeing deferall */ struct inet_frag_queue { struct rhash_head node; union { struct frag_v4_compare_key v4; struct frag_v6_compare_key v6; } key; struct timer_list timer; spinlock_t lock; refcount_t refcnt; struct rb_root rb_fragments; struct sk_buff *fragments_tail; struct sk_buff *last_run_head; ktime_t stamp; int len; int meat; __u8 flags; u16 max_size; struct fqdir *fqdir; struct rcu_head rcu; }; struct inet_frags { unsigned int qsize; void (*constructor)(struct inet_frag_queue *q, const void *arg); void (*destructor)(struct inet_frag_queue *); void (*frag_expire)(struct timer_list *t); struct kmem_cache *frags_cachep; const char *frags_cache_name; struct rhashtable_params rhash_params; refcount_t refcnt; struct completion completion; }; int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); int fqdir_init(struct fqdir **fqdirp, struct inet_frags *f, struct net *net); static inline void fqdir_pre_exit(struct fqdir *fqdir) { fqdir->high_thresh = 0; /* prevent creation of new frags */ fqdir->dead = true; } void fqdir_exit(struct fqdir *fqdir); void inet_frag_kill(struct inet_frag_queue *q); void inet_frag_destroy(struct inet_frag_queue *q); struct inet_frag_queue *inet_frag_find(struct fqdir *fqdir, void *key); /* Free all skbs in the queue; return the sum of their truesizes. */ unsigned int inet_frag_rbtree_purge(struct rb_root *root); static inline void inet_frag_put(struct inet_frag_queue *q) { if (refcount_dec_and_test(&q->refcnt)) inet_frag_destroy(q); } /* Memory Tracking Functions. */ static inline long frag_mem_limit(const struct fqdir *fqdir) { return atomic_long_read(&fqdir->mem); } static inline void sub_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_sub(val, &fqdir->mem); } static inline void add_frag_mem_limit(struct fqdir *fqdir, long val) { atomic_long_add(val, &fqdir->mem); } /* RFC 3168 support : * We want to check ECN values of all fragments, do detect invalid combinations. * In ipq->ecn, we store the OR value of each ip4_frag_ecn() fragment value. */ #define IPFRAG_ECN_NOT_ECT 0x01 /* one frag had ECN_NOT_ECT */ #define IPFRAG_ECN_ECT_1 0x02 /* one frag had ECN_ECT_1 */ #define IPFRAG_ECN_ECT_0 0x04 /* one frag had ECN_ECT_0 */ #define IPFRAG_ECN_CE 0x08 /* one frag had ECN_CE */ extern const u8 ip_frag_ecn_table[16]; /* Return values of inet_frag_queue_insert() */ #define IPFRAG_OK 0 #define IPFRAG_DUP 1 #define IPFRAG_OVERLAP 2 int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, int offset, int end); void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce); struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM sched #if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include <linux/sched/numa_balancing.h> #include <linux/tracepoint.h> #include <linux/binfmts.h> /* * Tracepoint for calling kthread_stop, performed to end a kthread: */ TRACE_EVENT(sched_kthread_stop, TP_PROTO(struct task_struct *t), TP_ARGS(t), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, t->comm, TASK_COMM_LEN); __entry->pid = t->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); /* * Tracepoint for the return value of the kthread stopping: */ TRACE_EVENT(sched_kthread_stop_ret, TP_PROTO(int ret), TP_ARGS(ret), TP_STRUCT__entry( __field( int, ret ) ), TP_fast_assign( __entry->ret = ret; ), TP_printk("ret=%d", __entry->ret) ); /* * Tracepoint for waking up a task: */ DECLARE_EVENT_CLASS(sched_wakeup_template, TP_PROTO(struct task_struct *p), TP_ARGS(__perf_task(p)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, success ) __field( int, target_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->success = 1; /* rudiment, kill when possible */ __entry->target_cpu = task_cpu(p); ), TP_printk("comm=%s pid=%d prio=%d target_cpu=%03d", __entry->comm, __entry->pid, __entry->prio, __entry->target_cpu) ); /* * Tracepoint called when waking a task; this tracepoint is guaranteed to be * called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_waking, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint called when the task is actually woken; p->state == TASK_RUNNNG. * It is not always called from the waking context. */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waking up a new task: */ DEFINE_EVENT(sched_wakeup_template, sched_wakeup_new, TP_PROTO(struct task_struct *p), TP_ARGS(p)); #ifdef CREATE_TRACE_POINTS static inline long __trace_sched_switch_state(bool preempt, struct task_struct *p) { unsigned int state; #ifdef CONFIG_SCHED_DEBUG BUG_ON(p != current); #endif /* CONFIG_SCHED_DEBUG */ /* * Preemption ignores task state, therefore preempted tasks are always * RUNNING (we will not have dequeued if state != RUNNING). */ if (preempt) return TASK_REPORT_MAX; /* * task_state_index() uses fls() and returns a value from 0-8 range. * Decrement it by 1 (except TASK_RUNNING state i.e 0) before using * it for left shift operation to get the correct task->state * mapping. */ state = task_state_index(p); return state ? (1 << (state - 1)) : state; } #endif /* CREATE_TRACE_POINTS */ /* * Tracepoint for task switches, performed by the scheduler: */ TRACE_EVENT(sched_switch, TP_PROTO(bool preempt, struct task_struct *prev, struct task_struct *next), TP_ARGS(preempt, prev, next), TP_STRUCT__entry( __array( char, prev_comm, TASK_COMM_LEN ) __field( pid_t, prev_pid ) __field( int, prev_prio ) __field( long, prev_state ) __array( char, next_comm, TASK_COMM_LEN ) __field( pid_t, next_pid ) __field( int, next_prio ) ), TP_fast_assign( memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); __entry->prev_pid = prev->pid; __entry->prev_prio = prev->prio; __entry->prev_state = __trace_sched_switch_state(preempt, prev); memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); __entry->next_pid = next->pid; __entry->next_prio = next->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("prev_comm=%s prev_pid=%d prev_prio=%d prev_state=%s%s ==> next_comm=%s next_pid=%d next_prio=%d", __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, (__entry->prev_state & (TASK_REPORT_MAX - 1)) ? __print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|", { TASK_INTERRUPTIBLE, "S" }, { TASK_UNINTERRUPTIBLE, "D" }, { __TASK_STOPPED, "T" }, { __TASK_TRACED, "t" }, { EXIT_DEAD, "X" }, { EXIT_ZOMBIE, "Z" }, { TASK_PARKED, "P" }, { TASK_DEAD, "I" }) : "R", __entry->prev_state & TASK_REPORT_MAX ? "+" : "", __entry->next_comm, __entry->next_pid, __entry->next_prio) ); /* * Tracepoint for a task being migrated: */ TRACE_EVENT(sched_migrate_task, TP_PROTO(struct task_struct *p, int dest_cpu), TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) __field( int, orig_cpu ) __field( int, dest_cpu ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), TP_printk("comm=%s pid=%d prio=%d orig_cpu=%d dest_cpu=%d", __entry->comm, __entry->pid, __entry->prio, __entry->orig_cpu, __entry->dest_cpu) ); DECLARE_EVENT_CLASS(sched_process_template, TP_PROTO(struct task_struct *p), TP_ARGS(p), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for freeing a task: */ DEFINE_EVENT(sched_process_template, sched_process_free, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a task exiting: */ DEFINE_EVENT(sched_process_template, sched_process_exit, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for waiting on task to unschedule: */ DEFINE_EVENT(sched_process_template, sched_wait_task, TP_PROTO(struct task_struct *p), TP_ARGS(p)); /* * Tracepoint for a waiting task: */ TRACE_EVENT(sched_process_wait, TP_PROTO(struct pid *pid), TP_ARGS(pid), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, prio ) ), TP_fast_assign( memcpy(__entry->comm, current->comm, TASK_COMM_LEN); __entry->pid = pid_nr(pid); __entry->prio = current->prio; /* XXX SCHED_DEADLINE */ ), TP_printk("comm=%s pid=%d prio=%d", __entry->comm, __entry->pid, __entry->prio) ); /* * Tracepoint for do_fork: */ TRACE_EVENT(sched_process_fork, TP_PROTO(struct task_struct *parent, struct task_struct *child), TP_ARGS(parent, child), TP_STRUCT__entry( __array( char, parent_comm, TASK_COMM_LEN ) __field( pid_t, parent_pid ) __array( char, child_comm, TASK_COMM_LEN ) __field( pid_t, child_pid ) ), TP_fast_assign( memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); __entry->parent_pid = parent->pid; memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); __entry->child_pid = child->pid; ), TP_printk("comm=%s pid=%d child_comm=%s child_pid=%d", __entry->parent_comm, __entry->parent_pid, __entry->child_comm, __entry->child_pid) ); /* * Tracepoint for exec: */ TRACE_EVENT(sched_process_exec, TP_PROTO(struct task_struct *p, pid_t old_pid, struct linux_binprm *bprm), TP_ARGS(p, old_pid, bprm), TP_STRUCT__entry( __string( filename, bprm->filename ) __field( pid_t, pid ) __field( pid_t, old_pid ) ), TP_fast_assign( __assign_str(filename, bprm->filename); __entry->pid = p->pid; __entry->old_pid = old_pid; ), TP_printk("filename=%s pid=%d old_pid=%d", __get_str(filename), __entry->pid, __entry->old_pid) ); #ifdef CONFIG_SCHEDSTATS #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS #else #define DEFINE_EVENT_SCHEDSTAT DEFINE_EVENT_NOP #define DECLARE_EVENT_CLASS_SCHEDSTAT DECLARE_EVENT_CLASS_NOP #endif /* * XXX the below sched_stat tracepoints only apply to SCHED_OTHER/BATCH/IDLE * adding sched_stat support to SCHED_FIFO/RR would be welcome. */ DECLARE_EVENT_CLASS_SCHEDSTAT(sched_stat_template, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(__perf_task(tsk), __perf_count(delay)), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, delay ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->delay = delay; ), TP_printk("comm=%s pid=%d delay=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->delay) ); /* * Tracepoint for accounting wait time (time the task is runnable * but not actually running due to scheduler contention). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_wait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting sleep time (time the task is not runnable, * including iowait, see below). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_sleep, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting iowait time (time the task is not runnable * due to waiting on IO to complete). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_iowait, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting blocked time (time the task is in uninterruptible). */ DEFINE_EVENT_SCHEDSTAT(sched_stat_template, sched_stat_blocked, TP_PROTO(struct task_struct *tsk, u64 delay), TP_ARGS(tsk, delay)); /* * Tracepoint for accounting runtime (time the task is executing * on a CPU). */ DECLARE_EVENT_CLASS(sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), TP_ARGS(tsk, __perf_count(runtime), vruntime), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( u64, runtime ) __field( u64, vruntime ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->runtime = runtime; __entry->vruntime = vruntime; ), TP_printk("comm=%s pid=%d runtime=%Lu [ns] vruntime=%Lu [ns]", __entry->comm, __entry->pid, (unsigned long long)__entry->runtime, (unsigned long long)__entry->vruntime) ); DEFINE_EVENT(sched_stat_runtime, sched_stat_runtime, TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime), TP_ARGS(tsk, runtime, vruntime)); /* * Tracepoint for showing priority inheritance modifying a tasks * priority. */ TRACE_EVENT(sched_pi_setprio, TP_PROTO(struct task_struct *tsk, struct task_struct *pi_task), TP_ARGS(tsk, pi_task), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) __field( int, oldprio ) __field( int, newprio ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; __entry->oldprio = tsk->prio; __entry->newprio = pi_task ? min(tsk->normal_prio, pi_task->prio) : tsk->normal_prio; /* XXX SCHED_DEADLINE bits missing */ ), TP_printk("comm=%s pid=%d oldprio=%d newprio=%d", __entry->comm, __entry->pid, __entry->oldprio, __entry->newprio) ); #ifdef CONFIG_DETECT_HUNG_TASK TRACE_EVENT(sched_process_hang, TP_PROTO(struct task_struct *tsk), TP_ARGS(tsk), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) __field( pid_t, pid ) ), TP_fast_assign( memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); __entry->pid = tsk->pid; ), TP_printk("comm=%s pid=%d", __entry->comm, __entry->pid) ); #endif /* CONFIG_DETECT_HUNG_TASK */ /* * Tracks migration of tasks from one runqueue to another. Can be used to * detect if automatic NUMA balancing is bouncing between nodes. */ TRACE_EVENT(sched_move_numa, TP_PROTO(struct task_struct *tsk, int src_cpu, int dst_cpu), TP_ARGS(tsk, src_cpu, dst_cpu), TP_STRUCT__entry( __field( pid_t, pid ) __field( pid_t, tgid ) __field( pid_t, ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->pid = task_pid_nr(tsk); __entry->tgid = task_tgid_nr(tsk); __entry->ngid = task_numa_group_id(tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_cpu = dst_cpu; __entry->dst_nid = cpu_to_node(dst_cpu); ), TP_printk("pid=%d tgid=%d ngid=%d src_cpu=%d src_nid=%d dst_cpu=%d dst_nid=%d", __entry->pid, __entry->tgid, __entry->ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_cpu, __entry->dst_nid) ); DECLARE_EVENT_CLASS(sched_numa_pair_template, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu), TP_STRUCT__entry( __field( pid_t, src_pid ) __field( pid_t, src_tgid ) __field( pid_t, src_ngid ) __field( int, src_cpu ) __field( int, src_nid ) __field( pid_t, dst_pid ) __field( pid_t, dst_tgid ) __field( pid_t, dst_ngid ) __field( int, dst_cpu ) __field( int, dst_nid ) ), TP_fast_assign( __entry->src_pid = task_pid_nr(src_tsk); __entry->src_tgid = task_tgid_nr(src_tsk); __entry->src_ngid = task_numa_group_id(src_tsk); __entry->src_cpu = src_cpu; __entry->src_nid = cpu_to_node(src_cpu); __entry->dst_pid = dst_tsk ? task_pid_nr(dst_tsk) : 0; __entry->dst_tgid = dst_tsk ? task_tgid_nr(dst_tsk) : 0; __entry->dst_ngid = dst_tsk ? task_numa_group_id(dst_tsk) : 0; __entry->dst_cpu = dst_cpu; __entry->dst_nid = dst_cpu >= 0 ? cpu_to_node(dst_cpu) : -1; ), TP_printk("src_pid=%d src_tgid=%d src_ngid=%d src_cpu=%d src_nid=%d dst_pid=%d dst_tgid=%d dst_ngid=%d dst_cpu=%d dst_nid=%d", __entry->src_pid, __entry->src_tgid, __entry->src_ngid, __entry->src_cpu, __entry->src_nid, __entry->dst_pid, __entry->dst_tgid, __entry->dst_ngid, __entry->dst_cpu, __entry->dst_nid) ); DEFINE_EVENT(sched_numa_pair_template, sched_stick_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_PROTO(struct task_struct *src_tsk, int src_cpu, struct task_struct *dst_tsk, int dst_cpu), TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); /* * Tracepoint for waking a polling cpu without an IPI. */ TRACE_EVENT(sched_wake_idle_without_ipi, TP_PROTO(int cpu), TP_ARGS(cpu), TP_STRUCT__entry( __field( int, cpu ) ), TP_fast_assign( __entry->cpu = cpu; ), TP_printk("cpu=%d", __entry->cpu) ); /* * Following tracepoints are not exported in tracefs and provide hooking * mechanisms only for testing and debugging purposes. * * Postfixed with _tp to make them easily identifiable in the code. */ DECLARE_TRACE(pelt_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(pelt_rt_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_dl_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_thermal_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_irq_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(pelt_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_cpu_capacity_tp, TP_PROTO(struct rq *rq), TP_ARGS(rq)); DECLARE_TRACE(sched_overutilized_tp, TP_PROTO(struct root_domain *rd, bool overutilized), TP_ARGS(rd, overutilized)); DECLARE_TRACE(sched_util_est_cfs_tp, TP_PROTO(struct cfs_rq *cfs_rq), TP_ARGS(cfs_rq)); DECLARE_TRACE(sched_util_est_se_tp, TP_PROTO(struct sched_entity *se), TP_ARGS(se)); DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 /* SPDX-License-Identifier: GPL-2.0 */ /* Rewritten and vastly simplified by Rusty Russell for in-kernel * module loader: * Copyright 2002 Rusty Russell <rusty@rustcorp.com.au> IBM Corporation */ #ifndef _LINUX_KALLSYMS_H #define _LINUX_KALLSYMS_H #include <linux/errno.h> #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/mm.h> #include <linux/module.h> #include <asm/sections.h> #define KSYM_NAME_LEN 128 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \ 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1) struct cred; struct module; static inline int is_kernel_inittext(unsigned long addr) { if (addr >= (unsigned long)_sinittext && addr <= (unsigned long)_einittext) return 1; return 0; } static inline int is_kernel_text(unsigned long addr) { if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) || arch_is_kernel_text(addr)) return 1; return in_gate_area_no_mm(addr); } static inline int is_kernel(unsigned long addr) { if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end) return 1; return in_gate_area_no_mm(addr); } static inline int is_ksym_addr(unsigned long addr) { if (IS_ENABLED(CONFIG_KALLSYMS_ALL)) return is_kernel(addr); return is_kernel_text(addr) || is_kernel_inittext(addr); } static inline void *dereference_symbol_descriptor(void *ptr) { #ifdef HAVE_DEREFERENCE_FUNCTION_DESCRIPTOR struct module *mod; ptr = dereference_kernel_function_descriptor(ptr); if (is_ksym_addr((unsigned long)ptr)) return ptr; preempt_disable(); mod = __module_address((unsigned long)ptr); preempt_enable(); if (mod) ptr = dereference_module_function_descriptor(mod, ptr); #endif return ptr; } #ifdef CONFIG_KALLSYMS /* Lookup the address for a symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name); /* Call a function on each kallsyms symbol in the core kernel */ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data); extern int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset); /* Lookup an address. modname is set to NULL if it's in the kernel. */ const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf); /* Look up a kernel symbol and return it in a text buffer. */ extern int sprint_symbol(char *buffer, unsigned long address); extern int sprint_symbol_no_offset(char *buffer, unsigned long address); extern int sprint_backtrace(char *buffer, unsigned long address); int lookup_symbol_name(unsigned long addr, char *symname); int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name); /* How and when do we show kallsyms values? */ extern bool kallsyms_show_value(const struct cred *cred); #else /* !CONFIG_KALLSYMS */ static inline unsigned long kallsyms_lookup_name(const char *name) { return 0; } static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, unsigned long), void *data) { return 0; } static inline int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, unsigned long *offset) { return 0; } static inline const char *kallsyms_lookup(unsigned long addr, unsigned long *symbolsize, unsigned long *offset, char **modname, char *namebuf) { return NULL; } static inline int sprint_symbol(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_symbol_no_offset(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int sprint_backtrace(char *buffer, unsigned long addr) { *buffer = '\0'; return 0; } static inline int lookup_symbol_name(unsigned long addr, char *symname) { return -ERANGE; } static inline int lookup_symbol_attrs(unsigned long addr, unsigned long *size, unsigned long *offset, char *modname, char *name) { return -ERANGE; } static inline bool kallsyms_show_value(const struct cred *cred) { return false; } #endif /*CONFIG_KALLSYMS*/ static inline void print_ip_sym(const char *loglvl, unsigned long ip) { printk("%s[<%px>] %pS\n", loglvl, (void *) ip, (void *) ip); } #endif /*_LINUX_KALLSYMS_H*/
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 /* SPDX-License-Identifier: GPL-2.0-or-later */ #ifndef __SOUND_CORE_H #define __SOUND_CORE_H /* * Main header file for the ALSA driver * Copyright (c) 1994-2001 by Jaroslav Kysela <perex@perex.cz> */ #include <linux/device.h> #include <linux/sched.h> /* wake_up() */ #include <linux/mutex.h> /* struct mutex */ #include <linux/rwsem.h> /* struct rw_semaphore */ #include <linux/pm.h> /* pm_message_t */ #include <linux/stringify.h> #include <linux/printk.h> /* number of supported soundcards */ #ifdef CONFIG_SND_DYNAMIC_MINORS #define SNDRV_CARDS CONFIG_SND_MAX_CARDS #else #define SNDRV_CARDS 8 /* don't change - minor numbers */ #endif #define CONFIG_SND_MAJOR 116 /* standard configuration */ /* forward declarations */ struct pci_dev; struct module; struct completion; /* device allocation stuff */ /* type of the object used in snd_device_*() * this also defines the calling order */ enum snd_device_type { SNDRV_DEV_LOWLEVEL, SNDRV_DEV_INFO, SNDRV_DEV_BUS, SNDRV_DEV_CODEC, SNDRV_DEV_PCM, SNDRV_DEV_COMPRESS, SNDRV_DEV_RAWMIDI, SNDRV_DEV_TIMER, SNDRV_DEV_SEQUENCER, SNDRV_DEV_HWDEP, SNDRV_DEV_JACK, SNDRV_DEV_CONTROL, /* NOTE: this must be the last one */ }; enum snd_device_state { SNDRV_DEV_BUILD, SNDRV_DEV_REGISTERED, SNDRV_DEV_DISCONNECTED, }; struct snd_device; struct snd_device_ops { int (*dev_free)(struct snd_device *dev); int (*dev_register)(struct snd_device *dev); int (*dev_disconnect)(struct snd_device *dev); }; struct snd_device { struct list_head list; /* list of registered devices */ struct snd_card *card; /* card which holds this device */ enum snd_device_state state; /* state of the device */ enum snd_device_type type; /* device type */ void *device_data; /* device structure */ const struct snd_device_ops *ops; /* operations */ }; #define snd_device(n) list_entry(n, struct snd_device, list) /* main structure for soundcard */ struct snd_card { int number; /* number of soundcard (index to snd_cards) */ char id[16]; /* id string of this card */ char driver[16]; /* driver name */ char shortname[32]; /* short name of this soundcard */ char longname[80]; /* name of this soundcard */ char irq_descr[32]; /* Interrupt description */ char mixername[80]; /* mixer name */ char components[128]; /* card components delimited with space */ struct module *module; /* top-level module */ void *private_data; /* private data for soundcard */ void (*private_free) (struct snd_card *card); /* callback for freeing of private data */ struct list_head devices; /* devices */ struct device ctl_dev; /* control device */ unsigned int last_numid; /* last used numeric ID */ struct rw_semaphore controls_rwsem; /* controls list lock */ rwlock_t ctl_files_rwlock; /* ctl_files list lock */ int controls_count; /* count of all controls */ int user_ctl_count; /* count of all user controls */ struct list_head controls; /* all controls for this card */ struct list_head ctl_files; /* active control files */ struct snd_info_entry *proc_root; /* root for soundcard specific files */ struct proc_dir_entry *proc_root_link; /* number link to real id */ struct list_head files_list; /* all files associated to this card */ struct snd_shutdown_f_ops *s_f_ops; /* file operations in the shutdown state */ spinlock_t files_lock; /* lock the files for this card */ int shutdown; /* this card is going down */ struct completion *release_completion; struct device *dev; /* device assigned to this card */ struct device card_dev; /* cardX object for sysfs */ const struct attribute_group *dev_groups[4]; /* assigned sysfs attr */ bool registered; /* card_dev is registered? */ int sync_irq; /* assigned irq, used for PCM sync */ wait_queue_head_t remove_sleep; size_t total_pcm_alloc_bytes; /* total amount of allocated buffers */ struct mutex memory_mutex; /* protection for the above */ #ifdef CONFIG_PM unsigned int power_state; /* power state */ wait_queue_head_t power_sleep; #endif #if IS_ENABLED(CONFIG_SND_MIXER_OSS) struct snd_mixer_oss *mixer_oss; int mixer_oss_change_count; #endif }; #define dev_to_snd_card(p) container_of(p, struct snd_card, card_dev) #ifdef CONFIG_PM static inline unsigned int snd_power_get_state(struct snd_card *card) { return card->power_state; } static inline void snd_power_change_state(struct snd_card *card, unsigned int state) { card->power_state = state; wake_up(&card->power_sleep); } /* init.c */ int snd_power_wait(struct snd_card *card, unsigned int power_state); #else /* ! CONFIG_PM */ static inline int snd_power_wait(struct snd_card *card, unsigned int state) { return 0; } #define snd_power_get_state(card) ({ (void)(card); SNDRV_CTL_POWER_D0; }) #define snd_power_change_state(card, state) do { (void)(card); } while (0) #endif /* CONFIG_PM */ struct snd_minor { int type; /* SNDRV_DEVICE_TYPE_XXX */ int card; /* card number */ int device; /* device number */ const struct file_operations *f_ops; /* file operations */ void *private_data; /* private data for f_ops->open */ struct device *dev; /* device for sysfs */ struct snd_card *card_ptr; /* assigned card instance */ }; /* return a device pointer linked to each sound device as a parent */ static inline struct device *snd_card_get_device_link(struct snd_card *card) { return card ? &card->card_dev : NULL; } /* sound.c */ extern int snd_major; extern int snd_ecards_limit; extern struct class *sound_class; void snd_request_card(int card); void snd_device_initialize(struct device *dev, struct snd_card *card); int snd_register_device(int type, struct snd_card *card, int dev, const struct file_operations *f_ops, void *private_data, struct device *device); int snd_unregister_device(struct device *dev); void *snd_lookup_minor_data(unsigned int minor, int type); #ifdef CONFIG_SND_OSSEMUL int snd_register_oss_device(int type, struct snd_card *card, int dev, const struct file_operations *f_ops, void *private_data); int snd_unregister_oss_device(int type, struct snd_card *card, int dev); void *snd_lookup_oss_minor_data(unsigned int minor, int type); #endif int snd_minor_info_init(void); /* sound_oss.c */ #ifdef CONFIG_SND_OSSEMUL int snd_minor_info_oss_init(void); #else static inline int snd_minor_info_oss_init(void) { return 0; } #endif /* memory.c */ int copy_to_user_fromio(void __user *dst, const volatile void __iomem *src, size_t count); int copy_from_user_toio(volatile void __iomem *dst, const void __user *src, size_t count); /* init.c */ int snd_card_locked(int card); #if IS_ENABLED(CONFIG_SND_MIXER_OSS) #define SND_MIXER_OSS_NOTIFY_REGISTER 0 #define SND_MIXER_OSS_NOTIFY_DISCONNECT 1 #define SND_MIXER_OSS_NOTIFY_FREE 2 extern int (*snd_mixer_oss_notify_callback)(struct snd_card *card, int cmd); #endif int snd_card_new(struct device *parent, int idx, const char *xid, struct module *module, int extra_size, struct snd_card **card_ret); int snd_card_disconnect(struct snd_card *card); void snd_card_disconnect_sync(struct snd_card *card); int snd_card_free(struct snd_card *card); int snd_card_free_when_closed(struct snd_card *card); void snd_card_set_id(struct snd_card *card, const char *id); int snd_card_register(struct snd_card *card); int snd_card_info_init(void); int snd_card_add_dev_attr(struct snd_card *card, const struct attribute_group *group); int snd_component_add(struct snd_card *card, const char *component); int snd_card_file_add(struct snd_card *card, struct file *file); int snd_card_file_remove(struct snd_card *card, struct file *file); struct snd_card *snd_card_ref(int card); /** * snd_card_unref - Unreference the card object * @card: the card object to unreference * * Call this function for the card object that was obtained via snd_card_ref() * or snd_lookup_minor_data(). */ static inline void snd_card_unref(struct snd_card *card) { put_device(&card->card_dev); } #define snd_card_set_dev(card, devptr) ((card)->dev = (devptr)) /* device.c */ int snd_device_new(struct snd_card *card, enum snd_device_type type, void *device_data, const struct snd_device_ops *ops); int snd_device_register(struct snd_card *card, void *device_data); int snd_device_register_all(struct snd_card *card); void snd_device_disconnect(struct snd_card *card, void *device_data); void snd_device_disconnect_all(struct snd_card *card); void snd_device_free(struct snd_card *card, void *device_data); void snd_device_free_all(struct snd_card *card); int snd_device_get_state(struct snd_card *card, void *device_data); /* isadma.c */ #ifdef CONFIG_ISA_DMA_API #define DMA_MODE_NO_ENABLE 0x0100 void snd_dma_program(unsigned long dma, unsigned long addr, unsigned int size, unsigned short mode); void snd_dma_disable(unsigned long dma); unsigned int snd_dma_pointer(unsigned long dma, unsigned int size); #endif /* misc.c */ struct resource; void release_and_free_resource(struct resource *res); /* --- */ /* sound printk debug levels */ enum { SND_PR_ALWAYS, SND_PR_DEBUG, SND_PR_VERBOSE, }; #if defined(CONFIG_SND_DEBUG) || defined(CONFIG_SND_VERBOSE_PRINTK) __printf(4, 5) void __snd_printk(unsigned int level, const char *file, int line, const char *format, ...); #else #define __snd_printk(level, file, line, format, ...) \ printk(format, ##__VA_ARGS__) #endif /** * snd_printk - printk wrapper * @fmt: format string * * Works like printk() but prints the file and the line of the caller * when configured with CONFIG_SND_VERBOSE_PRINTK. */ #define snd_printk(fmt, ...) \ __snd_printk(0, __FILE__, __LINE__, fmt, ##__VA_ARGS__) #ifdef CONFIG_SND_DEBUG /** * snd_printd - debug printk * @fmt: format string * * Works like snd_printk() for debugging purposes. * Ignored when CONFIG_SND_DEBUG is not set. */ #define snd_printd(fmt, ...) \ __snd_printk(1, __FILE__, __LINE__, fmt, ##__VA_ARGS__) #define _snd_printd(level, fmt, ...) \ __snd_printk(level, __FILE__, __LINE__, fmt, ##__VA_ARGS__) /** * snd_BUG - give a BUG warning message and stack trace * * Calls WARN() if CONFIG_SND_DEBUG is set. * Ignored when CONFIG_SND_DEBUG is not set. */ #define snd_BUG() WARN(1, "BUG?\n") /** * snd_printd_ratelimit - Suppress high rates of output when * CONFIG_SND_DEBUG is enabled. */ #define snd_printd_ratelimit() printk_ratelimit() /** * snd_BUG_ON - debugging check macro * @cond: condition to evaluate * * Has the same behavior as WARN_ON when CONFIG_SND_DEBUG is set, * otherwise just evaluates the conditional and returns the value. */ #define snd_BUG_ON(cond) WARN_ON((cond)) #else /* !CONFIG_SND_DEBUG */ __printf(1, 2) static inline void snd_printd(const char *format, ...) {} __printf(2, 3) static inline void _snd_printd(int level, const char *format, ...) {} #define snd_BUG() do { } while (0) #define snd_BUG_ON(condition) ({ \ int __ret_warn_on = !!(condition); \ unlikely(__ret_warn_on); \ }) static inline bool snd_printd_ratelimit(void) { return false; } #endif /* CONFIG_SND_DEBUG */ #ifdef CONFIG_SND_DEBUG_VERBOSE /** * snd_printdd - debug printk * @format: format string * * Works like snd_printk() for debugging purposes. * Ignored when CONFIG_SND_DEBUG_VERBOSE is not set. */ #define snd_printdd(format, ...) \ __snd_printk(2, __FILE__, __LINE__, format, ##__VA_ARGS__) #else __printf(1, 2) static inline void snd_printdd(const char *format, ...) {} #endif #define SNDRV_OSS_VERSION ((3<<16)|(8<<8)|(1<<4)|(0)) /* 3.8.1a */ /* for easier backward-porting */ #if IS_ENABLED(CONFIG_GAMEPORT) #define gameport_set_dev_parent(gp,xdev) ((gp)->dev.parent = (xdev)) #define gameport_set_port_data(gp,r) ((gp)->port_data = (r)) #define gameport_get_port_data(gp) (gp)->port_data #endif /* PCI quirk list helper */ struct snd_pci_quirk { unsigned short subvendor; /* PCI subvendor ID */ unsigned short subdevice; /* PCI subdevice ID */ unsigned short subdevice_mask; /* bitmask to match */ int value; /* value */ #ifdef CONFIG_SND_DEBUG_VERBOSE const char *name; /* name of the device (optional) */ #endif }; #define _SND_PCI_QUIRK_ID_MASK(vend, mask, dev) \ .subvendor = (vend), .subdevice = (dev), .subdevice_mask = (mask) #define _SND_PCI_QUIRK_ID(vend, dev) \ _SND_PCI_QUIRK_ID_MASK(vend, 0xffff, dev) #define SND_PCI_QUIRK_ID(vend,dev) {_SND_PCI_QUIRK_ID(vend, dev)} #ifdef CONFIG_SND_DEBUG_VERBOSE #define SND_PCI_QUIRK(vend,dev,xname,val) \ {_SND_PCI_QUIRK_ID(vend, dev), .value = (val), .name = (xname)} #define SND_PCI_QUIRK_VENDOR(vend, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, 0, 0), .value = (val), .name = (xname)} #define SND_PCI_QUIRK_MASK(vend, mask, dev, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, mask, dev), \ .value = (val), .name = (xname)} #define snd_pci_quirk_name(q) ((q)->name) #else #define SND_PCI_QUIRK(vend,dev,xname,val) \ {_SND_PCI_QUIRK_ID(vend, dev), .value = (val)} #define SND_PCI_QUIRK_MASK(vend, mask, dev, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, mask, dev), .value = (val)} #define SND_PCI_QUIRK_VENDOR(vend, xname, val) \ {_SND_PCI_QUIRK_ID_MASK(vend, 0, 0), .value = (val)} #define snd_pci_quirk_name(q) "" #endif #ifdef CONFIG_PCI const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list); const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list); #else static inline const struct snd_pci_quirk * snd_pci_quirk_lookup(struct pci_dev *pci, const struct snd_pci_quirk *list) { return NULL; } static inline const struct snd_pci_quirk * snd_pci_quirk_lookup_id(u16 vendor, u16 device, const struct snd_pci_quirk *list) { return NULL; } #endif #endif /* __SOUND_CORE_H */
1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 // SPDX-License-Identifier: GPL-2.0 /* * bus.c - bus driver management * * Copyright (c) 2002-3 Patrick Mochel * Copyright (c) 2002-3 Open Source Development Labs * Copyright (c) 2007 Greg Kroah-Hartman <gregkh@suse.de> * Copyright (c) 2007 Novell Inc. */ #include <linux/async.h> #include <linux/device/bus.h> #include <linux/device.h> #include <linux/module.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/string.h> #include <linux/mutex.h> #include <linux/sysfs.h> #include "base.h" #include "power/power.h" /* /sys/devices/system */ static struct kset *system_kset; #define to_bus_attr(_attr) container_of(_attr, struct bus_attribute, attr) /* * sysfs bindings for drivers */ #define to_drv_attr(_attr) container_of(_attr, struct driver_attribute, attr) #define DRIVER_ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) \ struct driver_attribute driver_attr_##_name = \ __ATTR_IGNORE_LOCKDEP(_name, _mode, _show, _store) static int __must_check bus_rescan_devices_helper(struct device *dev, void *data); static struct bus_type *bus_get(struct bus_type *bus) { if (bus) { kset_get(&bus->p->subsys); return bus; } return NULL; } static void bus_put(struct bus_type *bus) { if (bus) kset_put(&bus->p->subsys); } static ssize_t drv_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->show) ret = drv_attr->show(drv_priv->driver, buf); return ret; } static ssize_t drv_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct driver_attribute *drv_attr = to_drv_attr(attr); struct driver_private *drv_priv = to_driver(kobj); ssize_t ret = -EIO; if (drv_attr->store) ret = drv_attr->store(drv_priv->driver, buf, count); return ret; } static const struct sysfs_ops driver_sysfs_ops = { .show = drv_attr_show, .store = drv_attr_store, }; static void driver_release(struct kobject *kobj) { struct driver_private *drv_priv = to_driver(kobj); pr_debug("driver: '%s': %s\n", kobject_name(kobj), __func__); kfree(drv_priv); } static struct kobj_type driver_ktype = { .sysfs_ops = &driver_sysfs_ops, .release = driver_release, }; /* * sysfs bindings for buses */ static ssize_t bus_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->show) ret = bus_attr->show(subsys_priv->bus, buf); return ret; } static ssize_t bus_attr_store(struct kobject *kobj, struct attribute *attr, const char *buf, size_t count) { struct bus_attribute *bus_attr = to_bus_attr(attr); struct subsys_private *subsys_priv = to_subsys_private(kobj); ssize_t ret = 0; if (bus_attr->store) ret = bus_attr->store(subsys_priv->bus, buf, count); return ret; } static const struct sysfs_ops bus_sysfs_ops = { .show = bus_attr_show, .store = bus_attr_store, }; int bus_create_file(struct bus_type *bus, struct bus_attribute *attr) { int error; if (bus_get(bus)) { error = sysfs_create_file(&bus->p->subsys.kobj, &attr->attr); bus_put(bus); } else error = -EINVAL; return error; } EXPORT_SYMBOL_GPL(bus_create_file); void bus_remove_file(struct bus_type *bus, struct bus_attribute *attr) { if (bus_get(bus)) { sysfs_remove_file(&bus->p->subsys.kobj, &attr->attr); bus_put(bus); } } EXPORT_SYMBOL_GPL(bus_remove_file); static void bus_release(struct kobject *kobj) { struct subsys_private *priv = to_subsys_private(kobj); struct bus_type *bus = priv->bus; kfree(priv); bus->p = NULL; } static struct kobj_type bus_ktype = { .sysfs_ops = &bus_sysfs_ops, .release = bus_release, }; static int bus_uevent_filter(struct kset *kset, struct kobject *kobj) { struct kobj_type *ktype = get_ktype(kobj); if (ktype == &bus_ktype) return 1; return 0; } static const struct kset_uevent_ops bus_uevent_ops = { .filter = bus_uevent_filter, }; static struct kset *bus_kset; /* Manually detach a device from its associated driver. */ static ssize_t unbind_store(struct device_driver *drv, const char *buf, size_t count) { struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == drv) { device_driver_detach(dev); err = count; } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(unbind, S_IWUSR, NULL, unbind_store); /* * Manually attach a device to a driver. * Note: the driver must want to bind to the device, * it is not possible to override the driver's id table. */ static ssize_t bind_store(struct device_driver *drv, const char *buf, size_t count) { struct bus_type *bus = bus_get(drv->bus); struct device *dev; int err = -ENODEV; dev = bus_find_device_by_name(bus, NULL, buf); if (dev && dev->driver == NULL && driver_match_device(drv, dev)) { err = device_driver_attach(drv, dev); if (err > 0) { /* success */ err = count; } else if (err == 0) { /* driver didn't accept device */ err = -ENODEV; } } put_device(dev); bus_put(bus); return err; } static DRIVER_ATTR_IGNORE_LOCKDEP(bind, S_IWUSR, NULL, bind_store); static ssize_t drivers_autoprobe_show(struct bus_type *bus, char *buf) { return sysfs_emit(buf, "%d\n", bus->p->drivers_autoprobe); } static ssize_t drivers_autoprobe_store(struct bus_type *bus, const char *buf, size_t count) { if (buf[0] == '0') bus->p->drivers_autoprobe = 0; else bus->p->drivers_autoprobe = 1; return count; } static ssize_t drivers_probe_store(struct bus_type *bus, const char *buf, size_t count) { struct device *dev; int err = -EINVAL; dev = bus_find_device_by_name(bus, NULL, buf); if (!dev) return -ENODEV; if (bus_rescan_devices_helper(dev, NULL) == 0) err = count; put_device(dev); return err; } static struct device *next_device(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct device *dev = NULL; struct device_private *dev_prv; if (n) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; } return dev; } /** * bus_for_each_dev - device iterator. * @bus: bus type. * @start: device to start iterating from. * @data: data for the callback. * @fn: function to be called for each device. * * Iterate over @bus's list of devices, and call @fn for each, * passing it @data. If @start is not NULL, we use that device to * begin iterating from. * * We check the return of @fn each time. If it returns anything * other than 0, we break out and return that value. * * NOTE: The device that returns a non-zero value is not retained * in any way, nor is its refcount incremented. If the caller needs * to retain this data, it should do so, and increment the reference * count in the supplied callback. */ int bus_for_each_dev(struct bus_type *bus, struct device *start, void *data, int (*fn)(struct device *, void *)) { struct klist_iter i; struct device *dev; int error = 0; if (!bus || !bus->p) return -EINVAL; klist_iter_init_node(&bus->p->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while (!error && (dev = next_device(&i))) error = fn(dev, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(bus_for_each_dev); /** * bus_find_device - device iterator for locating a particular device. * @bus: bus type * @start: Device to begin with * @data: Data to pass to match function * @match: Callback function to check device * * This is similar to the bus_for_each_dev() function above, but it * returns a reference to a device that is 'found' for later use, as * determined by the @match callback. * * The callback should return 0 if the device doesn't match and non-zero * if it does. If the callback returns non-zero, this function will * return to the caller and not iterate over any more devices. */ struct device *bus_find_device(struct bus_type *bus, struct device *start, const void *data, int (*match)(struct device *dev, const void *data)) { struct klist_iter i; struct device *dev; if (!bus || !bus->p) return NULL; klist_iter_init_node(&bus->p->klist_devices, &i, (start ? &start->p->knode_bus : NULL)); while ((dev = next_device(&i))) if (match(dev, data) && get_device(dev)) break; klist_iter_exit(&i); return dev; } EXPORT_SYMBOL_GPL(bus_find_device); /** * subsys_find_device_by_id - find a device with a specific enumeration number * @subsys: subsystem * @id: index 'id' in struct device * @hint: device to check first * * Check the hint's next object and if it is a match return it directly, * otherwise, fall back to a full list search. Either way a reference for * the returned object is taken. */ struct device *subsys_find_device_by_id(struct bus_type *subsys, unsigned int id, struct device *hint) { struct klist_iter i; struct device *dev; if (!subsys) return NULL; if (hint) { klist_iter_init_node(&subsys->p->klist_devices, &i, &hint->p->knode_bus); dev = next_device(&i); if (dev && dev->id == id && get_device(dev)) { klist_iter_exit(&i); return dev; } klist_iter_exit(&i); } klist_iter_init_node(&subsys->p->klist_devices, &i, NULL); while ((dev = next_device(&i))) { if (dev->id == id && get_device(dev)) { klist_iter_exit(&i); return dev; } } klist_iter_exit(&i); return NULL; } EXPORT_SYMBOL_GPL(subsys_find_device_by_id); static struct device_driver *next_driver(struct klist_iter *i) { struct klist_node *n = klist_next(i); struct driver_private *drv_priv; if (n) { drv_priv = container_of(n, struct driver_private, knode_bus); return drv_priv->driver; } return NULL; } /** * bus_for_each_drv - driver iterator * @bus: bus we're dealing with. * @start: driver to start iterating on. * @data: data to pass to the callback. * @fn: function to call for each driver. * * This is nearly identical to the device iterator above. * We iterate over each driver that belongs to @bus, and call * @fn for each. If @fn returns anything but 0, we break out * and return it. If @start is not NULL, we use it as the head * of the list. * * NOTE: we don't return the driver that returns a non-zero * value, nor do we leave the reference count incremented for that * driver. If the caller needs to know that info, it must set it * in the callback. It must also be sure to increment the refcount * so it doesn't disappear before returning to the caller. */ int bus_for_each_drv(struct bus_type *bus, struct device_driver *start, void *data, int (*fn)(struct device_driver *, void *)) { struct klist_iter i; struct device_driver *drv; int error = 0; if (!bus) return -EINVAL; klist_iter_init_node(&bus->p->klist_drivers, &i, start ? &start->p->knode_bus : NULL); while ((drv = next_driver(&i)) && !error) error = fn(drv, data); klist_iter_exit(&i); return error; } EXPORT_SYMBOL_GPL(bus_for_each_drv); /** * bus_add_device - add device to bus * @dev: device being added * * - Add device's bus attributes. * - Create links to device's bus. * - Add the device to its bus's list of devices. */ int bus_add_device(struct device *dev) { struct bus_type *bus = bus_get(dev->bus); int error = 0; if (bus) { pr_debug("bus: '%s': add device %s\n", bus->name, dev_name(dev)); error = device_add_groups(dev, bus->dev_groups); if (error) goto out_put; error = sysfs_create_link(&bus->p->devices_kset->kobj, &dev->kobj, dev_name(dev)); if (error) goto out_groups; error = sysfs_create_link(&dev->kobj, &dev->bus->p->subsys.kobj, "subsystem"); if (error) goto out_subsys; klist_add_tail(&dev->p->knode_bus, &bus->p->klist_devices); } return 0; out_subsys: sysfs_remove_link(&bus->p->devices_kset->kobj, dev_name(dev)); out_groups: device_remove_groups(dev, bus->dev_groups); out_put: bus_put(dev->bus); return error; } /** * bus_probe_device - probe drivers for a new device * @dev: device to probe * * - Automatically probe for a driver if the bus allows it. */ void bus_probe_device(struct device *dev) { struct bus_type *bus = dev->bus; struct subsys_interface *sif; if (!bus) return; if (bus->p->drivers_autoprobe) device_initial_probe(dev); mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->add_dev) sif->add_dev(dev, sif); mutex_unlock(&bus->p->mutex); } /** * bus_remove_device - remove device from bus * @dev: device to be removed * * - Remove device from all interfaces. * - Remove symlink from bus' directory. * - Delete device from bus's list. * - Detach from its driver. * - Drop reference taken in bus_add_device(). */ void bus_remove_device(struct device *dev) { struct bus_type *bus = dev->bus; struct subsys_interface *sif; if (!bus) return; mutex_lock(&bus->p->mutex); list_for_each_entry(sif, &bus->p->interfaces, node) if (sif->remove_dev) sif->remove_dev(dev, sif); mutex_unlock(&bus->p->mutex); sysfs_remove_link(&dev->kobj, "subsystem"); sysfs_remove_link(&dev->bus->p->devices_kset->kobj, dev_name(dev)); device_remove_groups(dev, dev->bus->dev_groups); if (klist_node_attached(&dev->p->knode_bus)) klist_del(&dev->p->knode_bus); pr_debug("bus: '%s': remove device %s\n", dev->bus->name, dev_name(dev)); device_release_driver(dev); bus_put(dev->bus); } static int __must_check add_bind_files(struct device_driver *drv) { int ret; ret = driver_create_file(drv, &driver_attr_unbind); if (ret == 0) { ret = driver_create_file(drv, &driver_attr_bind); if (ret) driver_remove_file(drv, &driver_attr_unbind); } return ret; } static void remove_bind_files(struct device_driver *drv) { driver_remove_file(drv, &driver_attr_bind); driver_remove_file(drv, &driver_attr_unbind); } static BUS_ATTR_WO(drivers_probe); static BUS_ATTR_RW(drivers_autoprobe); static int add_probe_files(struct bus_type *bus) { int retval; retval = bus_create_file(bus, &bus_attr_drivers_probe); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_drivers_autoprobe); if (retval) bus_remove_file(bus, &bus_attr_drivers_probe); out: return retval; } static void remove_probe_files(struct bus_type *bus) { bus_remove_file(bus, &bus_attr_drivers_autoprobe); bus_remove_file(bus, &bus_attr_drivers_probe); } static ssize_t uevent_store(struct device_driver *drv, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&drv->p->kobj, buf, count); return rc ? rc : count; } static DRIVER_ATTR_WO(uevent); /** * bus_add_driver - Add a driver to the bus. * @drv: driver. */ int bus_add_driver(struct device_driver *drv) { struct bus_type *bus; struct driver_private *priv; int error = 0; bus = bus_get(drv->bus); if (!bus) return -EINVAL; pr_debug("bus: '%s': add driver %s\n", bus->name, drv->name); priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) { error = -ENOMEM; goto out_put_bus; } klist_init(&priv->klist_devices, NULL, NULL); priv->driver = drv; drv->p = priv; priv->kobj.kset = bus->p->drivers_kset; error = kobject_init_and_add(&priv->kobj, &driver_ktype, NULL, "%s", drv->name); if (error) goto out_unregister; klist_add_tail(&priv->knode_bus, &bus->p->klist_drivers); if (drv->bus->p->drivers_autoprobe) { error = driver_attach(drv); if (error) goto out_unregister; } module_add_driver(drv->owner, drv); error = driver_create_file(drv, &driver_attr_uevent); if (error) { printk(KERN_ERR "%s: uevent attr (%s) failed\n", __func__, drv->name); } error = driver_add_groups(drv, bus->drv_groups); if (error) { /* How the hell do we get out of this pickle? Give up */ printk(KERN_ERR "%s: driver_create_groups(%s) failed\n", __func__, drv->name); } if (!drv->suppress_bind_attrs) { error = add_bind_files(drv); if (error) { /* Ditto */ printk(KERN_ERR "%s: add_bind_files(%s) failed\n", __func__, drv->name); } } return 0; out_unregister: kobject_put(&priv->kobj); /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: bus_put(bus); return error; } /** * bus_remove_driver - delete driver from bus's knowledge. * @drv: driver. * * Detach the driver from the devices it controls, and remove * it from its bus's list of drivers. Finally, we drop the reference * to the bus we took in bus_add_driver(). */ void bus_remove_driver(struct device_driver *drv) { if (!drv->bus) return; if (!drv->suppress_bind_attrs) remove_bind_files(drv); driver_remove_groups(drv, drv->bus->drv_groups); driver_remove_file(drv, &driver_attr_uevent); klist_remove(&drv->p->knode_bus); pr_debug("bus: '%s': remove driver %s\n", drv->bus->name, drv->name); driver_detach(drv); module_remove_driver(drv); kobject_put(&drv->p->kobj); bus_put(drv->bus); } /* Helper for bus_rescan_devices's iter */ static int __must_check bus_rescan_devices_helper(struct device *dev, void *data) { int ret = 0; if (!dev->driver) { if (dev->parent && dev->bus->need_parent_lock) device_lock(dev->parent); ret = device_attach(dev); if (dev->parent && dev->bus->need_parent_lock) device_unlock(dev->parent); } return ret < 0 ? ret : 0; } /** * bus_rescan_devices - rescan devices on the bus for possible drivers * @bus: the bus to scan. * * This function will look for devices on the bus with no driver * attached and rescan it against existing drivers to see if it matches * any by calling device_attach() for the unbound devices. */ int bus_rescan_devices(struct bus_type *bus) { return bus_for_each_dev(bus, NULL, NULL, bus_rescan_devices_helper); } EXPORT_SYMBOL_GPL(bus_rescan_devices); /** * device_reprobe - remove driver for a device and probe for a new driver * @dev: the device to reprobe * * This function detaches the attached driver (if any) for the given * device and restarts the driver probing process. It is intended * to use if probing criteria changed during a devices lifetime and * driver attachment should change accordingly. */ int device_reprobe(struct device *dev) { if (dev->driver) device_driver_detach(dev); return bus_rescan_devices_helper(dev, NULL); } EXPORT_SYMBOL_GPL(device_reprobe); /** * find_bus - locate bus by name. * @name: name of bus. * * Call kset_find_obj() to iterate over list of buses to * find a bus by name. Return bus if found. * * Note that kset_find_obj increments bus' reference count. */ #if 0 struct bus_type *find_bus(char *name) { struct kobject *k = kset_find_obj(bus_kset, name); return k ? to_bus(k) : NULL; } #endif /* 0 */ static int bus_add_groups(struct bus_type *bus, const struct attribute_group **groups) { return sysfs_create_groups(&bus->p->subsys.kobj, groups); } static void bus_remove_groups(struct bus_type *bus, const struct attribute_group **groups) { sysfs_remove_groups(&bus->p->subsys.kobj, groups); } static void klist_devices_get(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; get_device(dev); } static void klist_devices_put(struct klist_node *n) { struct device_private *dev_prv = to_device_private_bus(n); struct device *dev = dev_prv->device; put_device(dev); } static ssize_t bus_uevent_store(struct bus_type *bus, const char *buf, size_t count) { int rc; rc = kobject_synth_uevent(&bus->p->subsys.kobj, buf, count); return rc ? rc : count; } /* * "open code" the old BUS_ATTR() macro here. We want to use BUS_ATTR_WO() * here, but can not use it as earlier in the file we have * DEVICE_ATTR_WO(uevent), which would cause a clash with the with the store * function name. */ static struct bus_attribute bus_attr_uevent = __ATTR(uevent, S_IWUSR, NULL, bus_uevent_store); /** * bus_register - register a driver-core subsystem * @bus: bus to register * * Once we have that, we register the bus with the kobject * infrastructure, then register the children subsystems it has: * the devices and drivers that belong to the subsystem. */ int bus_register(struct bus_type *bus) { int retval; struct subsys_private *priv; struct lock_class_key *key = &bus->lock_key; priv = kzalloc(sizeof(struct subsys_private), GFP_KERNEL); if (!priv) return -ENOMEM; priv->bus = bus; bus->p = priv; BLOCKING_INIT_NOTIFIER_HEAD(&priv->bus_notifier); retval = kobject_set_name(&priv->subsys.kobj, "%s", bus->name); if (retval) goto out; priv->subsys.kobj.kset = bus_kset; priv->subsys.kobj.ktype = &bus_ktype; priv->drivers_autoprobe = 1; retval = kset_register(&priv->subsys); if (retval) goto out; retval = bus_create_file(bus, &bus_attr_uevent); if (retval) goto bus_uevent_fail; priv->devices_kset = kset_create_and_add("devices", NULL, &priv->subsys.kobj); if (!priv->devices_kset) { retval = -ENOMEM; goto bus_devices_fail; } priv->drivers_kset = kset_create_and_add("drivers", NULL, &priv->subsys.kobj); if (!priv->drivers_kset) { retval = -ENOMEM; goto bus_drivers_fail; } INIT_LIST_HEAD(&priv->interfaces); __mutex_init(&priv->mutex, "subsys mutex", key); klist_init(&priv->klist_devices, klist_devices_get, klist_devices_put); klist_init(&priv->klist_drivers, NULL, NULL); retval = add_probe_files(bus); if (retval) goto bus_probe_files_fail; retval = bus_add_groups(bus, bus->bus_groups); if (retval) goto bus_groups_fail; pr_debug("bus: '%s': registered\n", bus->name); return 0; bus_groups_fail: remove_probe_files(bus); bus_probe_files_fail: kset_unregister(bus->p->drivers_kset); bus_drivers_fail: kset_unregister(bus->p->devices_kset); bus_devices_fail: bus_remove_file(bus, &bus_attr_uevent); bus_uevent_fail: kset_unregister(&bus->p->subsys); out: kfree(bus->p); bus->p = NULL; return retval; } EXPORT_SYMBOL_GPL(bus_register); /** * bus_unregister - remove a bus from the system * @bus: bus. * * Unregister the child subsystems and the bus itself. * Finally, we call bus_put() to release the refcount */ void bus_unregister(struct bus_type *bus) { pr_debug("bus: '%s': unregistering\n", bus->name); if (bus->dev_root) device_unregister(bus->dev_root); bus_remove_groups(bus, bus->bus_groups); remove_probe_files(bus); kset_unregister(bus->p->drivers_kset); kset_unregister(bus->p->devices_kset); bus_remove_file(bus, &bus_attr_uevent); kset_unregister(&bus->p->subsys); } EXPORT_SYMBOL_GPL(bus_unregister); int bus_register_notifier(struct bus_type *bus, struct notifier_block *nb) { return blocking_notifier_chain_register(&bus->p->bus_notifier, nb); } EXPORT_SYMBOL_GPL(bus_register_notifier); int bus_unregister_notifier(struct bus_type *bus, struct notifier_block *nb) { return blocking_notifier_chain_unregister(&bus->p->bus_notifier, nb); } EXPORT_SYMBOL_GPL(bus_unregister_notifier); struct kset *bus_get_kset(struct bus_type *bus) { return &bus->p->subsys; } EXPORT_SYMBOL_GPL(bus_get_kset); struct klist *bus_get_device_klist(struct bus_type *bus) { return &bus->p->klist_devices; } EXPORT_SYMBOL_GPL(bus_get_device_klist); /* * Yes, this forcibly breaks the klist abstraction temporarily. It * just wants to sort the klist, not change reference counts and * take/drop locks rapidly in the process. It does all this while * holding the lock for the list, so objects can't otherwise be * added/removed while we're swizzling. */ static void device_insertion_sort_klist(struct device *a, struct list_head *list, int (*compare)(const struct device *a, const struct device *b)) { struct klist_node *n; struct device_private *dev_prv; struct device *b; list_for_each_entry(n, list, n_node) { dev_prv = to_device_private_bus(n); b = dev_prv->device; if (compare(a, b) <= 0) { list_move_tail(&a->p->knode_bus.n_node, &b->p->knode_bus.n_node); return; } } list_move_tail(&a->p->knode_bus.n_node, list); } void bus_sort_breadthfirst(struct bus_type *bus, int (*compare)(const struct device *a, const struct device *b)) { LIST_HEAD(sorted_devices); struct klist_node *n, *tmp; struct device_private *dev_prv; struct device *dev; struct klist *device_klist; device_klist = bus_get_device_klist(bus); spin_lock(&device_klist->k_lock); list_for_each_entry_safe(n, tmp, &device_klist->k_list, n_node) { dev_prv = to_device_private_bus(n); dev = dev_prv->device; device_insertion_sort_klist(dev, &sorted_devices, compare); } list_splice(&sorted_devices, &device_klist->k_list); spin_unlock(&device_klist->k_lock); } EXPORT_SYMBOL_GPL(bus_sort_breadthfirst); /** * subsys_dev_iter_init - initialize subsys device iterator * @iter: subsys iterator to initialize * @subsys: the subsys we wanna iterate over * @start: the device to start iterating from, if any * @type: device_type of the devices to iterate over, NULL for all * * Initialize subsys iterator @iter such that it iterates over devices * of @subsys. If @start is set, the list iteration will start there, * otherwise if it is NULL, the iteration starts at the beginning of * the list. */ void subsys_dev_iter_init(struct subsys_dev_iter *iter, struct bus_type *subsys, struct device *start, const struct device_type *type) { struct klist_node *start_knode = NULL; if (start) start_knode = &start->p->knode_bus; klist_iter_init_node(&subsys->p->klist_devices, &iter->ki, start_knode); iter->type = type; } EXPORT_SYMBOL_GPL(subsys_dev_iter_init); /** * subsys_dev_iter_next - iterate to the next device * @iter: subsys iterator to proceed * * Proceed @iter to the next device and return it. Returns NULL if * iteration is complete. * * The returned device is referenced and won't be released till * iterator is proceed to the next device or exited. The caller is * free to do whatever it wants to do with the device including * calling back into subsys code. */ struct device *subsys_dev_iter_next(struct subsys_dev_iter *iter) { struct klist_node *knode; struct device *dev; for (;;) { knode = klist_next(&iter->ki); if (!knode) return NULL; dev = to_device_private_bus(knode)->device; if (!iter->type || iter->type == dev->type) return dev; } } EXPORT_SYMBOL_GPL(subsys_dev_iter_next); /** * subsys_dev_iter_exit - finish iteration * @iter: subsys iterator to finish * * Finish an iteration. Always call this function after iteration is * complete whether the iteration ran till the end or not. */ void subsys_dev_iter_exit(struct subsys_dev_iter *iter) { klist_iter_exit(&iter->ki); } EXPORT_SYMBOL_GPL(subsys_dev_iter_exit); int subsys_interface_register(struct subsys_interface *sif) { struct bus_type *subsys; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return -ENODEV; subsys = bus_get(sif->subsys); if (!subsys) return -EINVAL; mutex_lock(&subsys->p->mutex); list_add_tail(&sif->node, &subsys->p->interfaces); if (sif->add_dev) { subsys_dev_iter_init(&iter, subsys, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->add_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&subsys->p->mutex); return 0; } EXPORT_SYMBOL_GPL(subsys_interface_register); void subsys_interface_unregister(struct subsys_interface *sif) { struct bus_type *subsys; struct subsys_dev_iter iter; struct device *dev; if (!sif || !sif->subsys) return; subsys = sif->subsys; mutex_lock(&subsys->p->mutex); list_del_init(&sif->node); if (sif->remove_dev) { subsys_dev_iter_init(&iter, subsys, NULL, NULL); while ((dev = subsys_dev_iter_next(&iter))) sif->remove_dev(dev, sif); subsys_dev_iter_exit(&iter); } mutex_unlock(&subsys->p->mutex); bus_put(subsys); } EXPORT_SYMBOL_GPL(subsys_interface_unregister); static void system_root_device_release(struct device *dev) { kfree(dev); } static int subsys_register(struct bus_type *subsys, const struct attribute_group **groups, struct kobject *parent_of_root) { struct device *dev; int err; err = bus_register(subsys); if (err < 0) return err; dev = kzalloc(sizeof(struct device), GFP_KERNEL); if (!dev) { err = -ENOMEM; goto err_dev; } err = dev_set_name(dev, "%s", subsys->name); if (err < 0) goto err_name; dev->kobj.parent = parent_of_root; dev->groups = groups; dev->release = system_root_device_release; err = device_register(dev); if (err < 0) goto err_dev_reg; subsys->dev_root = dev; return 0; err_dev_reg: put_device(dev); dev = NULL; err_name: kfree(dev); err_dev: bus_unregister(subsys); return err; } /** * subsys_system_register - register a subsystem at /sys/devices/system/ * @subsys: system subsystem * @groups: default attributes for the root device * * All 'system' subsystems have a /sys/devices/system/<name> root device * with the name of the subsystem. The root device can carry subsystem- * wide attributes. All registered devices are below this single root * device and are named after the subsystem with a simple enumeration * number appended. The registered devices are not explicitly named; * only 'id' in the device needs to be set. * * Do not use this interface for anything new, it exists for compatibility * with bad ideas only. New subsystems should use plain subsystems; and * add the subsystem-wide attributes should be added to the subsystem * directory itself and not some create fake root-device placed in * /sys/devices/system/<name>. */ int subsys_system_register(struct bus_type *subsys, const struct attribute_group **groups) { return subsys_register(subsys, groups, &system_kset->kobj); } EXPORT_SYMBOL_GPL(subsys_system_register); /** * subsys_virtual_register - register a subsystem at /sys/devices/virtual/ * @subsys: virtual subsystem * @groups: default attributes for the root device * * All 'virtual' subsystems have a /sys/devices/system/<name> root device * with the name of the subystem. The root device can carry subsystem-wide * attributes. All registered devices are below this single root device. * There's no restriction on device naming. This is for kernel software * constructs which need sysfs interface. */ int subsys_virtual_register(struct bus_type *subsys, const struct attribute_group **groups) { struct kobject *virtual_dir; virtual_dir = virtual_device_parent(NULL); if (!virtual_dir) return -ENOMEM; return subsys_register(subsys, groups, virtual_dir); } EXPORT_SYMBOL_GPL(subsys_virtual_register); int __init buses_init(void) { bus_kset = kset_create_and_add("bus", &bus_uevent_ops, NULL); if (!bus_kset) return -ENOMEM; system_kset = kset_create_and_add("system", NULL, &devices_kset->kobj); if (!system_kset) return -ENOMEM; return 0; }
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 /* SPDX-License-Identifier: GPL-2.0 */ /* * Runtime locking correctness validator * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * see Documentation/locking/lockdep-design.rst for more details. */ #ifndef __LINUX_LOCKDEP_H #define __LINUX_LOCKDEP_H #include <linux/lockdep_types.h> #include <linux/smp.h> #include <asm/percpu.h> struct task_struct; /* for sysctl */ extern int prove_locking; extern int lock_stat; #ifdef CONFIG_LOCKDEP #include <linux/linkage.h> #include <linux/list.h> #include <linux/debug_locks.h> #include <linux/stacktrace.h> static inline void lockdep_copy_map(struct lockdep_map *to, struct lockdep_map *from) { int i; *to = *from; /* * Since the class cache can be modified concurrently we could observe * half pointers (64bit arch using 32bit copy insns). Therefore clear * the caches and take the performance hit. * * XXX it doesn't work well with lockdep_set_class_and_subclass(), since * that relies on cache abuse. */ for (i = 0; i < NR_LOCKDEP_CACHING_CLASSES; i++) to->class_cache[i] = NULL; } /* * Every lock has a list of other locks that were taken after it. * We only grow the list, never remove from it: */ struct lock_list { struct list_head entry; struct lock_class *class; struct lock_class *links_to; const struct lock_trace *trace; u16 distance; /* bitmap of different dependencies from head to this */ u8 dep; /* used by BFS to record whether "prev -> this" only has -(*R)-> */ u8 only_xr; /* * The parent field is used to implement breadth-first search, and the * bit 0 is reused to indicate if the lock has been accessed in BFS. */ struct lock_list *parent; }; /** * struct lock_chain - lock dependency chain record * * @irq_context: the same as irq_context in held_lock below * @depth: the number of held locks in this chain * @base: the index in chain_hlocks for this chain * @entry: the collided lock chains in lock_chain hash list * @chain_key: the hash key of this lock_chain */ struct lock_chain { /* see BUILD_BUG_ON()s in add_chain_cache() */ unsigned int irq_context : 2, depth : 6, base : 24; /* 4 byte hole */ struct hlist_node entry; u64 chain_key; }; #define MAX_LOCKDEP_KEYS_BITS 13 #define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS) #define INITIAL_CHAIN_KEY -1 struct held_lock { /* * One-way hash of the dependency chain up to this point. We * hash the hashes step by step as the dependency chain grows. * * We use it for dependency-caching and we skip detection * passes and dependency-updates if there is a cache-hit, so * it is absolutely critical for 100% coverage of the validator * to have a unique key value for every unique dependency path * that can occur in the system, to make a unique hash value * as likely as possible - hence the 64-bit width. * * The task struct holds the current hash value (initialized * with zero), here we store the previous hash value: */ u64 prev_chain_key; unsigned long acquire_ip; struct lockdep_map *instance; struct lockdep_map *nest_lock; #ifdef CONFIG_LOCK_STAT u64 waittime_stamp; u64 holdtime_stamp; #endif /* * class_idx is zero-indexed; it points to the element in * lock_classes this held lock instance belongs to. class_idx is in * the range from 0 to (MAX_LOCKDEP_KEYS-1) inclusive. */ unsigned int class_idx:MAX_LOCKDEP_KEYS_BITS; /* * The lock-stack is unified in that the lock chains of interrupt * contexts nest ontop of process context chains, but we 'separate' * the hashes by starting with 0 if we cross into an interrupt * context, and we also keep do not add cross-context lock * dependencies - the lock usage graph walking covers that area * anyway, and we'd just unnecessarily increase the number of * dependencies otherwise. [Note: hardirq and softirq contexts * are separated from each other too.] * * The following field is used to detect when we cross into an * interrupt context: */ unsigned int irq_context:2; /* bit 0 - soft, bit 1 - hard */ unsigned int trylock:1; /* 16 bits */ unsigned int read:2; /* see lock_acquire() comment */ unsigned int check:1; /* see lock_acquire() comment */ unsigned int hardirqs_off:1; unsigned int references:12; /* 32 bits */ unsigned int pin_count; }; /* * Initialization, self-test and debugging-output methods: */ extern void lockdep_init(void); extern void lockdep_reset(void); extern void lockdep_reset_lock(struct lockdep_map *lock); extern void lockdep_free_key_range(void *start, unsigned long size); extern asmlinkage void lockdep_sys_exit(void); extern void lockdep_set_selftest_task(struct task_struct *task); extern void lockdep_init_task(struct task_struct *task); /* * Split the recrursion counter in two to readily detect 'off' vs recursion. */ #define LOCKDEP_RECURSION_BITS 16 #define LOCKDEP_OFF (1U << LOCKDEP_RECURSION_BITS) #define LOCKDEP_RECURSION_MASK (LOCKDEP_OFF - 1) /* * lockdep_{off,on}() are macros to avoid tracing and kprobes; not inlines due * to header dependencies. */ #define lockdep_off() \ do { \ current->lockdep_recursion += LOCKDEP_OFF; \ } while (0) #define lockdep_on() \ do { \ current->lockdep_recursion -= LOCKDEP_OFF; \ } while (0) extern void lockdep_register_key(struct lock_class_key *key); extern void lockdep_unregister_key(struct lock_class_key *key); /* * These methods are used by specific locking variants (spinlocks, * rwlocks, mutexes and rwsems) to pass init/acquire/release events * to lockdep: */ extern void lockdep_init_map_type(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer, u8 lock_type); static inline void lockdep_init_map_waits(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner, u8 outer) { lockdep_init_map_type(lock, name, key, subclass, inner, LD_WAIT_INV, LD_LOCK_NORMAL); } static inline void lockdep_init_map_wait(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass, u8 inner) { lockdep_init_map_waits(lock, name, key, subclass, inner, LD_WAIT_INV); } static inline void lockdep_init_map(struct lockdep_map *lock, const char *name, struct lock_class_key *key, int subclass) { lockdep_init_map_wait(lock, name, key, subclass, LD_WAIT_INV); } /* * Reinitialize a lock key - for cases where there is special locking or * special initialization of locks so that the validator gets the scope * of dependencies wrong: they are either too broad (they need a class-split) * or they are too narrow (they suffer from a false class-split): */ #define lockdep_set_class(lock, key) \ lockdep_init_map_waits(&(lock)->dep_map, #key, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_class_and_name(lock, key, name) \ lockdep_init_map_waits(&(lock)->dep_map, name, key, 0, \ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_class_and_subclass(lock, key, sub) \ lockdep_init_map_waits(&(lock)->dep_map, #key, key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_subclass(lock, sub) \ lockdep_init_map_waits(&(lock)->dep_map, #lock, (lock)->dep_map.key, sub,\ (lock)->dep_map.wait_type_inner, \ (lock)->dep_map.wait_type_outer) #define lockdep_set_novalidate_class(lock) \ lockdep_set_class_and_name(lock, &__lockdep_no_validate__, #lock) /* * Compare locking classes */ #define lockdep_match_class(lock, key) lockdep_match_key(&(lock)->dep_map, key) static inline int lockdep_match_key(struct lockdep_map *lock, struct lock_class_key *key) { return lock->key == key; } /* * Acquire a lock. * * Values for "read": * * 0: exclusive (write) acquire * 1: read-acquire (no recursion allowed) * 2: read-acquire with same-instance recursion allowed * * Values for check: * * 0: simple checks (freeing, held-at-exit-time, etc.) * 1: full validation */ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *nest_lock, unsigned long ip); extern void lock_release(struct lockdep_map *lock, unsigned long ip); /* * Same "read" as for lock_acquire(), except -1 means any. */ extern int lock_is_held_type(const struct lockdep_map *lock, int read); static inline int lock_is_held(const struct lockdep_map *lock) { return lock_is_held_type(lock, -1); } #define lockdep_is_held(lock) lock_is_held(&(lock)->dep_map) #define lockdep_is_held_type(lock, r) lock_is_held_type(&(lock)->dep_map, (r)) extern void lock_set_class(struct lockdep_map *lock, const char *name, struct lock_class_key *key, unsigned int subclass, unsigned long ip); static inline void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, unsigned long ip) { lock_set_class(lock, lock->name, lock->key, subclass, ip); } extern void lock_downgrade(struct lockdep_map *lock, unsigned long ip); #define NIL_COOKIE (struct pin_cookie){ .val = 0U, } extern struct pin_cookie lock_pin_lock(struct lockdep_map *lock); extern void lock_repin_lock(struct lockdep_map *lock, struct pin_cookie); extern void lock_unpin_lock(struct lockdep_map *lock, struct pin_cookie); #define lockdep_depth(tsk) (debug_locks ? (tsk)->lockdep_depth : 0) #define lockdep_assert_held(l) do { \ WARN_ON(debug_locks && !lockdep_is_held(l)); \ } while (0) #define lockdep_assert_held_write(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 0)); \ } while (0) #define lockdep_assert_held_read(l) do { \ WARN_ON(debug_locks && !lockdep_is_held_type(l, 1)); \ } while (0) #define lockdep_assert_held_once(l) do { \ WARN_ON_ONCE(debug_locks && !lockdep_is_held(l)); \ } while (0) #define lockdep_recursing(tsk) ((tsk)->lockdep_recursion) #define lockdep_pin_lock(l) lock_pin_lock(&(l)->dep_map) #define lockdep_repin_lock(l,c) lock_repin_lock(&(l)->dep_map, (c)) #define lockdep_unpin_lock(l,c) lock_unpin_lock(&(l)->dep_map, (c)) #else /* !CONFIG_LOCKDEP */ static inline void lockdep_init_task(struct task_struct *task) { } static inline void lockdep_off(void) { } static inline void lockdep_on(void) { } static inline void lockdep_set_selftest_task(struct task_struct *task) { } # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, i) do { } while (0) # define lock_downgrade(l, i) do { } while (0) # define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_init_map_type(lock, name, key, sub, inner, outer, type) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_waits(lock, name, key, sub, inner, outer) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map_wait(lock, name, key, sub, inner) \ do { (void)(name); (void)(key); } while (0) # define lockdep_init_map(lock, name, key, sub) \ do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) #define lockdep_set_novalidate_class(lock) do { } while (0) /* * We don't define lockdep_match_class() and lockdep_match_key() for !LOCKDEP * case since the result is not well defined and the caller should rather * #ifdef the call himself. */ # define lockdep_reset() do { debug_locks = 1; } while (0) # define lockdep_free_key_range(start, size) do { } while (0) # define lockdep_sys_exit() do { } while (0) static inline void lockdep_register_key(struct lock_class_key *key) { } static inline void lockdep_unregister_key(struct lock_class_key *key) { } #define lockdep_depth(tsk) (0) #define lockdep_is_held_type(l, r) (1) #define lockdep_assert_held(l) do { (void)(l); } while (0) #define lockdep_assert_held_write(l) do { (void)(l); } while (0) #define lockdep_assert_held_read(l) do { (void)(l); } while (0) #define lockdep_assert_held_once(l) do { (void)(l); } while (0) #define lockdep_recursing(tsk) (0) #define NIL_COOKIE (struct pin_cookie){ } #define lockdep_pin_lock(l) ({ struct pin_cookie cookie = { }; cookie; }) #define lockdep_repin_lock(l, c) do { (void)(l); (void)(c); } while (0) #define lockdep_unpin_lock(l, c) do { (void)(l); (void)(c); } while (0) #endif /* !LOCKDEP */ enum xhlock_context_t { XHLOCK_HARD, XHLOCK_SOFT, XHLOCK_CTX_NR, }; #define lockdep_init_map_crosslock(m, n, k, s) do {} while (0) /* * To initialize a lockdep_map statically use this macro. * Note that _name must not be NULL. */ #define STATIC_LOCKDEP_MAP_INIT(_name, _key) \ { .name = (_name), .key = (void *)(_key), } static inline void lockdep_invariant_state(bool force) {} static inline void lockdep_free_task(struct task_struct *task) {} #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ ({ \ int ____err = 0; \ if (!try(_lock)) { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ ____err = lock(_lock); \ } \ if (!____err) \ lock_acquired(&(_lock)->dep_map, _RET_IP_); \ ____err; \ }) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) #define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) #define LOCK_CONTENDED_RETURN(_lock, try, lock) \ lock(_lock) #endif /* CONFIG_LOCK_STAT */ #ifdef CONFIG_LOCKDEP /* * On lockdep we dont want the hand-coded irq-enable of * _raw_*_lock_flags() code, because lockdep assumes * that interrupts are not re-enabled during lock-acquire: */ #define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ LOCK_CONTENDED((_lock), (try), (lock)) #else /* CONFIG_LOCKDEP */ #define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \ lockfl((_lock), (flags)) #endif /* CONFIG_LOCKDEP */ #ifdef CONFIG_PROVE_LOCKING extern void print_irqtrace_events(struct task_struct *curr); #else static inline void print_irqtrace_events(struct task_struct *curr) { } #endif /* Variable used to make lockdep treat read_lock() as recursive in selftests */ #ifdef CONFIG_DEBUG_LOCKING_API_SELFTESTS extern unsigned int force_read_lock_recursive; #else /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #define force_read_lock_recursive 0 #endif /* CONFIG_DEBUG_LOCKING_API_SELFTESTS */ #ifdef CONFIG_LOCKDEP extern bool read_lock_is_recursive(void); #else /* CONFIG_LOCKDEP */ /* If !LOCKDEP, the value is meaningless */ #define read_lock_is_recursive() 0 #endif /* * For trivial one-depth nesting of a lock-class, the following * global define can be used. (Subsystems with multiple levels * of nesting should define their own lock-nesting subclasses.) */ #define SINGLE_DEPTH_NESTING 1 /* * Map the dependency ops to NOP or to real lockdep ops, depending * on the per lock-class debug mode: */ #define lock_acquire_exclusive(l, s, t, n, i) lock_acquire(l, s, t, 0, 1, n, i) #define lock_acquire_shared(l, s, t, n, i) lock_acquire(l, s, t, 1, 1, n, i) #define lock_acquire_shared_recursive(l, s, t, n, i) lock_acquire(l, s, t, 2, 1, n, i) #define spin_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define spin_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define spin_release(l, i) lock_release(l, i) #define rwlock_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwlock_acquire_read(l, s, t, i) \ do { \ if (read_lock_is_recursive()) \ lock_acquire_shared_recursive(l, s, t, NULL, i); \ else \ lock_acquire_shared(l, s, t, NULL, i); \ } while (0) #define rwlock_release(l, i) lock_release(l, i) #define seqcount_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define seqcount_acquire_read(l, s, t, i) lock_acquire_shared_recursive(l, s, t, NULL, i) #define seqcount_release(l, i) lock_release(l, i) #define mutex_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define mutex_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define mutex_release(l, i) lock_release(l, i) #define rwsem_acquire(l, s, t, i) lock_acquire_exclusive(l, s, t, NULL, i) #define rwsem_acquire_nest(l, s, t, n, i) lock_acquire_exclusive(l, s, t, n, i) #define rwsem_acquire_read(l, s, t, i) lock_acquire_shared(l, s, t, NULL, i) #define rwsem_release(l, i) lock_release(l, i) #define lock_map_acquire(l) lock_acquire_exclusive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_read(l) lock_acquire_shared_recursive(l, 0, 0, NULL, _THIS_IP_) #define lock_map_acquire_tryread(l) lock_acquire_shared_recursive(l, 0, 1, NULL, _THIS_IP_) #define lock_map_release(l) lock_release(l, _THIS_IP_) #ifdef CONFIG_PROVE_LOCKING # define might_lock(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 0, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_read(lock) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, 0, 0, 1, 1, NULL, _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) # define might_lock_nested(lock, subclass) \ do { \ typecheck(struct lockdep_map *, &(lock)->dep_map); \ lock_acquire(&(lock)->dep_map, subclass, 0, 1, 1, NULL, \ _THIS_IP_); \ lock_release(&(lock)->dep_map, _THIS_IP_); \ } while (0) DECLARE_PER_CPU(int, hardirqs_enabled); DECLARE_PER_CPU(int, hardirq_context); DECLARE_PER_CPU(unsigned int, lockdep_recursion); #define __lockdep_enabled (debug_locks && !this_cpu_read(lockdep_recursion)) #define lockdep_assert_irqs_enabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_irqs_disabled() \ do { \ WARN_ON_ONCE(__lockdep_enabled && this_cpu_read(hardirqs_enabled)); \ } while (0) #define lockdep_assert_in_irq() \ do { \ WARN_ON_ONCE(__lockdep_enabled && !this_cpu_read(hardirq_context)); \ } while (0) #define lockdep_assert_preemption_enabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() != 0 || \ !this_cpu_read(hardirqs_enabled))); \ } while (0) #define lockdep_assert_preemption_disabled() \ do { \ WARN_ON_ONCE(IS_ENABLED(CONFIG_PREEMPT_COUNT) && \ __lockdep_enabled && \ (preempt_count() == 0 && \ this_cpu_read(hardirqs_enabled))); \ } while (0) #else # define might_lock(lock) do { } while (0) # define might_lock_read(lock) do { } while (0) # define might_lock_nested(lock, subclass) do { } while (0) # define lockdep_assert_irqs_enabled() do { } while (0) # define lockdep_assert_irqs_disabled() do { } while (0) # define lockdep_assert_in_irq() do { } while (0) # define lockdep_assert_preemption_enabled() do { } while (0) # define lockdep_assert_preemption_disabled() do { } while (0) #endif #ifdef CONFIG_PROVE_RAW_LOCK_NESTING # define lockdep_assert_RT_in_threaded_ctx() do { \ WARN_ONCE(debug_locks && !current->lockdep_recursion && \ lockdep_hardirq_context() && \ !(current->hardirq_threaded || current->irq_config), \ "Not in threaded context on PREEMPT_RT as expected\n"); \ } while (0) #else # define lockdep_assert_RT_in_threaded_ctx() do { } while (0) #endif #ifdef CONFIG_LOCKDEP void lockdep_rcu_suspicious(const char *file, const int line, const char *s); #else static inline void lockdep_rcu_suspicious(const char *file, const int line, const char *s) { } #endif #endif /* __LINUX_LOCKDEP_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Linux INET6 implementation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> */ #ifndef _NET_IPV6_H #define _NET_IPV6_H #include <linux/ipv6.h> #include <linux/hardirq.h> #include <linux/jhash.h> #include <linux/refcount.h> #include <linux/jump_label_ratelimit.h> #include <net/if_inet6.h> #include <net/ndisc.h> #include <net/flow.h> #include <net/flow_dissector.h> #include <net/snmp.h> #include <net/netns/hash.h> #define SIN6_LEN_RFC2133 24 #define IPV6_MAXPLEN 65535 /* * NextHeader field of IPv6 header */ #define NEXTHDR_HOP 0 /* Hop-by-hop option header. */ #define NEXTHDR_TCP 6 /* TCP segment. */ #define NEXTHDR_UDP 17 /* UDP message. */ #define NEXTHDR_IPV6 41 /* IPv6 in IPv6 */ #define NEXTHDR_ROUTING 43 /* Routing header. */ #define NEXTHDR_FRAGMENT 44 /* Fragmentation/reassembly header. */ #define NEXTHDR_GRE 47 /* GRE header. */ #define NEXTHDR_ESP 50 /* Encapsulating security payload. */ #define NEXTHDR_AUTH 51 /* Authentication header. */ #define NEXTHDR_ICMP 58 /* ICMP for IPv6. */ #define NEXTHDR_NONE 59 /* No next header */ #define NEXTHDR_DEST 60 /* Destination options header. */ #define NEXTHDR_SCTP 132 /* SCTP message. */ #define NEXTHDR_MOBILITY 135 /* Mobility header. */ #define NEXTHDR_MAX 255 #define IPV6_DEFAULT_HOPLIMIT 64 #define IPV6_DEFAULT_MCASTHOPS 1 /* Limits on Hop-by-Hop and Destination options. * * Per RFC8200 there is no limit on the maximum number or lengths of options in * Hop-by-Hop or Destination options other then the packet must fit in an MTU. * We allow configurable limits in order to mitigate potential denial of * service attacks. * * There are three limits that may be set: * - Limit the number of options in a Hop-by-Hop or Destination options * extension header * - Limit the byte length of a Hop-by-Hop or Destination options extension * header * - Disallow unknown options * * The limits are expressed in corresponding sysctls: * * ipv6.sysctl.max_dst_opts_cnt * ipv6.sysctl.max_hbh_opts_cnt * ipv6.sysctl.max_dst_opts_len * ipv6.sysctl.max_hbh_opts_len * * max_*_opts_cnt is the number of TLVs that are allowed for Destination * options or Hop-by-Hop options. If the number is less than zero then unknown * TLVs are disallowed and the number of known options that are allowed is the * absolute value. Setting the value to INT_MAX indicates no limit. * * max_*_opts_len is the length limit in bytes of a Destination or * Hop-by-Hop options extension header. Setting the value to INT_MAX * indicates no length limit. * * If a limit is exceeded when processing an extension header the packet is * silently discarded. */ /* Default limits for Hop-by-Hop and Destination options */ #define IP6_DEFAULT_MAX_DST_OPTS_CNT 8 #define IP6_DEFAULT_MAX_HBH_OPTS_CNT 8 #define IP6_DEFAULT_MAX_DST_OPTS_LEN INT_MAX /* No limit */ #define IP6_DEFAULT_MAX_HBH_OPTS_LEN INT_MAX /* No limit */ /* * Addr type * * type - unicast | multicast * scope - local | site | global * v4 - compat * v4mapped * any * loopback */ #define IPV6_ADDR_ANY 0x0000U #define IPV6_ADDR_UNICAST 0x0001U #define IPV6_ADDR_MULTICAST 0x0002U #define IPV6_ADDR_LOOPBACK 0x0010U #define IPV6_ADDR_LINKLOCAL 0x0020U #define IPV6_ADDR_SITELOCAL 0x0040U #define IPV6_ADDR_COMPATv4 0x0080U #define IPV6_ADDR_SCOPE_MASK 0x00f0U #define IPV6_ADDR_MAPPED 0x1000U /* * Addr scopes */ #define IPV6_ADDR_MC_SCOPE(a) \ ((a)->s6_addr[1] & 0x0f) /* nonstandard */ #define __IPV6_ADDR_SCOPE_INVALID -1 #define IPV6_ADDR_SCOPE_NODELOCAL 0x01 #define IPV6_ADDR_SCOPE_LINKLOCAL 0x02 #define IPV6_ADDR_SCOPE_SITELOCAL 0x05 #define IPV6_ADDR_SCOPE_ORGLOCAL 0x08 #define IPV6_ADDR_SCOPE_GLOBAL 0x0e /* * Addr flags */ #define IPV6_ADDR_MC_FLAG_TRANSIENT(a) \ ((a)->s6_addr[1] & 0x10) #define IPV6_ADDR_MC_FLAG_PREFIX(a) \ ((a)->s6_addr[1] & 0x20) #define IPV6_ADDR_MC_FLAG_RENDEZVOUS(a) \ ((a)->s6_addr[1] & 0x40) /* * fragmentation header */ struct frag_hdr { __u8 nexthdr; __u8 reserved; __be16 frag_off; __be32 identification; }; #define IP6_MF 0x0001 #define IP6_OFFSET 0xFFF8 struct ip6_fraglist_iter { struct ipv6hdr *tmp_hdr; struct sk_buff *frag; int offset; unsigned int hlen; __be32 frag_id; u8 nexthdr; }; int ip6_fraglist_init(struct sk_buff *skb, unsigned int hlen, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_fraglist_iter *iter); void ip6_fraglist_prepare(struct sk_buff *skb, struct ip6_fraglist_iter *iter); static inline struct sk_buff *ip6_fraglist_next(struct ip6_fraglist_iter *iter) { struct sk_buff *skb = iter->frag; iter->frag = skb->next; skb_mark_not_on_list(skb); return skb; } struct ip6_frag_state { u8 *prevhdr; unsigned int hlen; unsigned int mtu; unsigned int left; int offset; int ptr; int hroom; int troom; __be32 frag_id; u8 nexthdr; }; void ip6_frag_init(struct sk_buff *skb, unsigned int hlen, unsigned int mtu, unsigned short needed_tailroom, int hdr_room, u8 *prevhdr, u8 nexthdr, __be32 frag_id, struct ip6_frag_state *state); struct sk_buff *ip6_frag_next(struct sk_buff *skb, struct ip6_frag_state *state); #define IP6_REPLY_MARK(net, mark) \ ((net)->ipv6.sysctl.fwmark_reflect ? (mark) : 0) #include <net/sock.h> /* sysctls */ extern int sysctl_mld_max_msf; extern int sysctl_mld_qrv; #define _DEVINC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_INC_STATS64((_idev)->stats.statname, (field));\ mod##SNMP_INC_STATS64((net)->mib.statname##_statistics, (field));\ }) /* per device counters are atomic_long_t */ #define _DEVINCATOMIC(net, statname, mod, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ mod##SNMP_INC_STATS((net)->mib.statname##_statistics, (field));\ }) /* per device and per net counters are atomic_long_t */ #define _DEVINC_ATOMIC_ATOMIC(net, statname, idev, field) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ SNMP_INC_STATS_ATOMIC_LONG((_idev)->stats.statname##dev, (field)); \ SNMP_INC_STATS_ATOMIC_LONG((net)->mib.statname##_statistics, (field));\ }) #define _DEVADD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_ADD_STATS((_idev)->stats.statname, (field), (val)); \ mod##SNMP_ADD_STATS((net)->mib.statname##_statistics, (field), (val));\ }) #define _DEVUPD(net, statname, mod, idev, field, val) \ ({ \ struct inet6_dev *_idev = (idev); \ if (likely(_idev != NULL)) \ mod##SNMP_UPD_PO_STATS((_idev)->stats.statname, field, (val)); \ mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\ }) /* MIBs */ #define IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, , idev, field) #define __IP6_INC_STATS(net, idev,field) \ _DEVINC(net, ipv6, __, idev, field) #define IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, , idev, field, val) #define __IP6_ADD_STATS(net, idev,field,val) \ _DEVADD(net, ipv6, __, idev, field, val) #define IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, , idev, field, val) #define __IP6_UPD_PO_STATS(net, idev,field,val) \ _DEVUPD(net, ipv6, __, idev, field, val) #define ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, , idev, field) #define __ICMP6_INC_STATS(net, idev, field) \ _DEVINCATOMIC(net, icmpv6, __, idev, field) #define ICMP6MSGOUT_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field +256) #define ICMP6MSGIN_INC_STATS(net, idev, field) \ _DEVINC_ATOMIC_ATOMIC(net, icmpv6msg, idev, field) struct ip6_ra_chain { struct ip6_ra_chain *next; struct sock *sk; int sel; void (*destructor)(struct sock *); }; extern struct ip6_ra_chain *ip6_ra_chain; extern rwlock_t ip6_ra_lock; /* This structure is prepared by protocol, when parsing ancillary data and passed to IPv6. */ struct ipv6_txoptions { refcount_t refcnt; /* Length of this structure */ int tot_len; /* length of extension headers */ __u16 opt_flen; /* after fragment hdr */ __u16 opt_nflen; /* before fragment hdr */ struct ipv6_opt_hdr *hopopt; struct ipv6_opt_hdr *dst0opt; struct ipv6_rt_hdr *srcrt; /* Routing Header */ struct ipv6_opt_hdr *dst1opt; struct rcu_head rcu; /* Option buffer, as read by IPV6_PKTOPTIONS, starts here. */ }; /* flowlabel_reflect sysctl values */ enum flowlabel_reflect { FLOWLABEL_REFLECT_ESTABLISHED = 1, FLOWLABEL_REFLECT_TCP_RESET = 2, FLOWLABEL_REFLECT_ICMPV6_ECHO_REPLIES = 4, }; struct ip6_flowlabel { struct ip6_flowlabel __rcu *next; __be32 label; atomic_t users; struct in6_addr dst; struct ipv6_txoptions *opt; unsigned long linger; struct rcu_head rcu; u8 share; union { struct pid *pid; kuid_t uid; } owner; unsigned long lastuse; unsigned long expires; struct net *fl_net; }; #define IPV6_FLOWINFO_MASK cpu_to_be32(0x0FFFFFFF) #define IPV6_FLOWLABEL_MASK cpu_to_be32(0x000FFFFF) #define IPV6_FLOWLABEL_STATELESS_FLAG cpu_to_be32(0x00080000) #define IPV6_TCLASS_MASK (IPV6_FLOWINFO_MASK & ~IPV6_FLOWLABEL_MASK) #define IPV6_TCLASS_SHIFT 20 struct ipv6_fl_socklist { struct ipv6_fl_socklist __rcu *next; struct ip6_flowlabel *fl; struct rcu_head rcu; }; struct ipcm6_cookie { struct sockcm_cookie sockc; __s16 hlimit; __s16 tclass; __s8 dontfrag; struct ipv6_txoptions *opt; __u16 gso_size; }; static inline void ipcm6_init(struct ipcm6_cookie *ipc6) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = -1, .dontfrag = -1, }; } static inline void ipcm6_init_sk(struct ipcm6_cookie *ipc6, const struct ipv6_pinfo *np) { *ipc6 = (struct ipcm6_cookie) { .hlimit = -1, .tclass = np->tclass, .dontfrag = np->dontfrag, }; } static inline struct ipv6_txoptions *txopt_get(const struct ipv6_pinfo *np) { struct ipv6_txoptions *opt; rcu_read_lock(); opt = rcu_dereference(np->opt); if (opt) { if (!refcount_inc_not_zero(&opt->refcnt)) opt = NULL; else opt = rcu_pointer_handoff(opt); } rcu_read_unlock(); return opt; } static inline void txopt_put(struct ipv6_txoptions *opt) { if (opt && refcount_dec_and_test(&opt->refcnt)) kfree_rcu(opt, rcu); } struct ip6_flowlabel *__fl6_sock_lookup(struct sock *sk, __be32 label); extern struct static_key_false_deferred ipv6_flowlabel_exclusive; static inline struct ip6_flowlabel *fl6_sock_lookup(struct sock *sk, __be32 label) { if (static_branch_unlikely(&ipv6_flowlabel_exclusive.key)) return __fl6_sock_lookup(sk, label) ? : ERR_PTR(-ENOENT); return NULL; } struct ipv6_txoptions *fl6_merge_options(struct ipv6_txoptions *opt_space, struct ip6_flowlabel *fl, struct ipv6_txoptions *fopt); void fl6_free_socklist(struct sock *sk); int ipv6_flowlabel_opt(struct sock *sk, sockptr_t optval, int optlen); int ipv6_flowlabel_opt_get(struct sock *sk, struct in6_flowlabel_req *freq, int flags); int ip6_flowlabel_init(void); void ip6_flowlabel_cleanup(void); bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np); static inline void fl6_sock_release(struct ip6_flowlabel *fl) { if (fl) atomic_dec(&fl->users); } void icmpv6_notify(struct sk_buff *skb, u8 type, u8 code, __be32 info); void icmpv6_push_pending_frames(struct sock *sk, struct flowi6 *fl6, struct icmp6hdr *thdr, int len); int ip6_ra_control(struct sock *sk, int sel); int ipv6_parse_hopopts(struct sk_buff *skb); struct ipv6_txoptions *ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt); struct ipv6_txoptions *ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt, int newtype, struct ipv6_opt_hdr *newopt); struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb, const struct inet6_skb_parm *opt); struct ipv6_txoptions *ipv6_update_options(struct sock *sk, struct ipv6_txoptions *opt); static inline bool ipv6_accept_ra(struct inet6_dev *idev) { /* If forwarding is enabled, RA are not accepted unless the special * hybrid mode (accept_ra=2) is enabled. */ return idev->cnf.forwarding ? idev->cnf.accept_ra == 2 : idev->cnf.accept_ra; } #define IPV6_FRAG_HIGH_THRESH (4 * 1024*1024) /* 4194304 */ #define IPV6_FRAG_LOW_THRESH (3 * 1024*1024) /* 3145728 */ #define IPV6_FRAG_TIMEOUT (60 * HZ) /* 60 seconds */ int __ipv6_addr_type(const struct in6_addr *addr); static inline int ipv6_addr_type(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & 0xffff; } static inline int ipv6_addr_scope(const struct in6_addr *addr) { return __ipv6_addr_type(addr) & IPV6_ADDR_SCOPE_MASK; } static inline int __ipv6_addr_src_scope(int type) { return (type == IPV6_ADDR_ANY) ? __IPV6_ADDR_SCOPE_INVALID : (type >> 16); } static inline int ipv6_addr_src_scope(const struct in6_addr *addr) { return __ipv6_addr_src_scope(__ipv6_addr_type(addr)); } static inline bool __ipv6_addr_needs_scope_id(int type) { return type & IPV6_ADDR_LINKLOCAL || (type & IPV6_ADDR_MULTICAST && (type & (IPV6_ADDR_LOOPBACK|IPV6_ADDR_LINKLOCAL))); } static inline __u32 ipv6_iface_scope_id(const struct in6_addr *addr, int iface) { return __ipv6_addr_needs_scope_id(__ipv6_addr_type(addr)) ? iface : 0; } static inline int ipv6_addr_cmp(const struct in6_addr *a1, const struct in6_addr *a2) { return memcmp(a1, a2, sizeof(struct in6_addr)); } static inline bool ipv6_masked_addr_cmp(const struct in6_addr *a1, const struct in6_addr *m, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ulm = (const unsigned long *)m; const unsigned long *ul2 = (const unsigned long *)a2; return !!(((ul1[0] ^ ul2[0]) & ulm[0]) | ((ul1[1] ^ ul2[1]) & ulm[1])); #else return !!(((a1->s6_addr32[0] ^ a2->s6_addr32[0]) & m->s6_addr32[0]) | ((a1->s6_addr32[1] ^ a2->s6_addr32[1]) & m->s6_addr32[1]) | ((a1->s6_addr32[2] ^ a2->s6_addr32[2]) & m->s6_addr32[2]) | ((a1->s6_addr32[3] ^ a2->s6_addr32[3]) & m->s6_addr32[3])); #endif } static inline void ipv6_addr_prefix(struct in6_addr *pfx, const struct in6_addr *addr, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memset(pfx->s6_addr, 0, sizeof(pfx->s6_addr)); memcpy(pfx->s6_addr, addr, o); if (b != 0) pfx->s6_addr[o] = addr->s6_addr[o] & (0xff00 >> b); } static inline void ipv6_addr_prefix_copy(struct in6_addr *addr, const struct in6_addr *pfx, int plen) { /* caller must guarantee 0 <= plen <= 128 */ int o = plen >> 3, b = plen & 0x7; memcpy(addr->s6_addr, pfx, o); if (b != 0) { addr->s6_addr[o] &= ~(0xff00 >> b); addr->s6_addr[o] |= (pfx->s6_addr[o] & (0xff00 >> b)); } } static inline void __ipv6_addr_set_half(__be32 *addr, __be32 wh, __be32 wl) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 #if defined(__BIG_ENDIAN) if (__builtin_constant_p(wh) && __builtin_constant_p(wl)) { *(__force u64 *)addr = ((__force u64)(wh) << 32 | (__force u64)(wl)); return; } #elif defined(__LITTLE_ENDIAN) if (__builtin_constant_p(wl) && __builtin_constant_p(wh)) { *(__force u64 *)addr = ((__force u64)(wl) << 32 | (__force u64)(wh)); return; } #endif #endif addr[0] = wh; addr[1] = wl; } static inline void ipv6_addr_set(struct in6_addr *addr, __be32 w1, __be32 w2, __be32 w3, __be32 w4) { __ipv6_addr_set_half(&addr->s6_addr32[0], w1, w2); __ipv6_addr_set_half(&addr->s6_addr32[2], w3, w4); } static inline bool ipv6_addr_equal(const struct in6_addr *a1, const struct in6_addr *a2) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul1 = (const unsigned long *)a1; const unsigned long *ul2 = (const unsigned long *)a2; return ((ul1[0] ^ ul2[0]) | (ul1[1] ^ ul2[1])) == 0UL; #else return ((a1->s6_addr32[0] ^ a2->s6_addr32[0]) | (a1->s6_addr32[1] ^ a2->s6_addr32[1]) | (a1->s6_addr32[2] ^ a2->s6_addr32[2]) | (a1->s6_addr32[3] ^ a2->s6_addr32[3])) == 0; #endif } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline bool __ipv6_prefix_equal64_half(const __be64 *a1, const __be64 *a2, unsigned int len) { if (len && ((*a1 ^ *a2) & cpu_to_be64((~0UL) << (64 - len)))) return false; return true; } static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be64 *a1 = (const __be64 *)addr1; const __be64 *a2 = (const __be64 *)addr2; if (prefixlen >= 64) { if (a1[0] ^ a2[0]) return false; return __ipv6_prefix_equal64_half(a1 + 1, a2 + 1, prefixlen - 64); } return __ipv6_prefix_equal64_half(a1, a2, prefixlen); } #else static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, const struct in6_addr *addr2, unsigned int prefixlen) { const __be32 *a1 = addr1->s6_addr32; const __be32 *a2 = addr2->s6_addr32; unsigned int pdw, pbi; /* check complete u32 in prefix */ pdw = prefixlen >> 5; if (pdw && memcmp(a1, a2, pdw << 2)) return false; /* check incomplete u32 in prefix */ pbi = prefixlen & 0x1f; if (pbi && ((a1[pdw] ^ a2[pdw]) & htonl((0xffffffff) << (32 - pbi)))) return false; return true; } #endif static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; return (ul[0] | ul[1]) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | a->s6_addr32[3]) == 0; #endif } static inline u32 ipv6_addr_hash(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const unsigned long *ul = (const unsigned long *)a; unsigned long x = ul[0] ^ ul[1]; return (u32)(x ^ (x >> 32)); #else return (__force u32)(a->s6_addr32[0] ^ a->s6_addr32[1] ^ a->s6_addr32[2] ^ a->s6_addr32[3]); #endif } /* more secured version of ipv6_addr_hash() */ static inline u32 __ipv6_addr_jhash(const struct in6_addr *a, const u32 initval) { u32 v = (__force u32)a->s6_addr32[0] ^ (__force u32)a->s6_addr32[1]; return jhash_3words(v, (__force u32)a->s6_addr32[2], (__force u32)a->s6_addr32[3], initval); } static inline bool ipv6_addr_loopback(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 const __be64 *be = (const __be64 *)a; return (be[0] | (be[1] ^ cpu_to_be64(1))) == 0UL; #else return (a->s6_addr32[0] | a->s6_addr32[1] | a->s6_addr32[2] | (a->s6_addr32[3] ^ cpu_to_be32(1))) == 0; #endif } /* * Note that we must __force cast these to unsigned long to make sparse happy, * since all of the endian-annotated types are fixed size regardless of arch. */ static inline bool ipv6_addr_v4mapped(const struct in6_addr *a) { return ( #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 *(unsigned long *)a | #else (__force unsigned long)(a->s6_addr32[0] | a->s6_addr32[1]) | #endif (__force unsigned long)(a->s6_addr32[2] ^ cpu_to_be32(0x0000ffff))) == 0UL; } static inline bool ipv6_addr_v4mapped_loopback(const struct in6_addr *a) { return ipv6_addr_v4mapped(a) && ipv4_is_loopback(a->s6_addr32[3]); } static inline u32 ipv6_portaddr_hash(const struct net *net, const struct in6_addr *addr6, unsigned int port) { unsigned int hash, mix = net_hash_mix(net); if (ipv6_addr_any(addr6)) hash = jhash_1word(0, mix); else if (ipv6_addr_v4mapped(addr6)) hash = jhash_1word((__force u32)addr6->s6_addr32[3], mix); else hash = jhash2((__force u32 *)addr6->s6_addr32, 4, mix); return hash ^ port; } /* * Check for a RFC 4843 ORCHID address * (Overlay Routable Cryptographic Hash Identifiers) */ static inline bool ipv6_addr_orchid(const struct in6_addr *a) { return (a->s6_addr32[0] & htonl(0xfffffff0)) == htonl(0x20010010); } static inline bool ipv6_addr_is_multicast(const struct in6_addr *addr) { return (addr->s6_addr32[0] & htonl(0xFF000000)) == htonl(0xFF000000); } static inline void ipv6_addr_set_v4mapped(const __be32 addr, struct in6_addr *v4mapped) { ipv6_addr_set(v4mapped, 0, 0, htonl(0x0000FFFF), addr); } /* * find the first different bit between two addresses * length of address must be a multiple of 32bits */ static inline int __ipv6_addr_diff32(const void *token1, const void *token2, int addrlen) { const __be32 *a1 = token1, *a2 = token2; int i; addrlen >>= 2; for (i = 0; i < addrlen; i++) { __be32 xb = a1[i] ^ a2[i]; if (xb) return i * 32 + 31 - __fls(ntohl(xb)); } /* * we should *never* get to this point since that * would mean the addrs are equal * * However, we do get to it 8) And exacly, when * addresses are equal 8) * * ip route add 1111::/128 via ... * ip route add 1111::/64 via ... * and we are here. * * Ideally, this function should stop comparison * at prefix length. It does not, but it is still OK, * if returned value is greater than prefix length. * --ANK (980803) */ return addrlen << 5; } #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 static inline int __ipv6_addr_diff64(const void *token1, const void *token2, int addrlen) { const __be64 *a1 = token1, *a2 = token2; int i; addrlen >>= 3; for (i = 0; i < addrlen; i++) { __be64 xb = a1[i] ^ a2[i]; if (xb) return i * 64 + 63 - __fls(be64_to_cpu(xb)); } return addrlen << 6; } #endif static inline int __ipv6_addr_diff(const void *token1, const void *token2, int addrlen) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 if (__builtin_constant_p(addrlen) && !(addrlen & 7)) return __ipv6_addr_diff64(token1, token2, addrlen); #endif return __ipv6_addr_diff32(token1, token2, addrlen); } static inline int ipv6_addr_diff(const struct in6_addr *a1, const struct in6_addr *a2) { return __ipv6_addr_diff(a1, a2, sizeof(struct in6_addr)); } __be32 ipv6_select_ident(struct net *net, const struct in6_addr *daddr, const struct in6_addr *saddr); __be32 ipv6_proxy_select_ident(struct net *net, struct sk_buff *skb); int ip6_dst_hoplimit(struct dst_entry *dst); static inline int ip6_sk_dst_hoplimit(struct ipv6_pinfo *np, struct flowi6 *fl6, struct dst_entry *dst) { int hlimit; if (ipv6_addr_is_multicast(&fl6->daddr)) hlimit = np->mcast_hops; else hlimit = np->hop_limit; if (hlimit < 0) hlimit = ip6_dst_hoplimit(dst); return hlimit; } /* copy IPv6 saddr & daddr to flow_keys, possibly using 64bit load/store * Equivalent to : flow->v6addrs.src = iph->saddr; * flow->v6addrs.dst = iph->daddr; */ static inline void iph_to_flow_copy_v6addrs(struct flow_keys *flow, const struct ipv6hdr *iph) { BUILD_BUG_ON(offsetof(typeof(flow->addrs), v6addrs.dst) != offsetof(typeof(flow->addrs), v6addrs.src) + sizeof(flow->addrs.v6addrs.src)); memcpy(&flow->addrs.v6addrs, &iph->saddr, sizeof(flow->addrs.v6addrs)); flow->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } #if IS_ENABLED(CONFIG_IPV6) static inline bool ipv6_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv6.sysctl.ip_nonlocal_bind || inet->freebind || inet->transparent; } /* Sysctl settings for net ipv6.auto_flowlabels */ #define IP6_AUTO_FLOW_LABEL_OFF 0 #define IP6_AUTO_FLOW_LABEL_OPTOUT 1 #define IP6_AUTO_FLOW_LABEL_OPTIN 2 #define IP6_AUTO_FLOW_LABEL_FORCED 3 #define IP6_AUTO_FLOW_LABEL_MAX IP6_AUTO_FLOW_LABEL_FORCED #define IP6_DEFAULT_AUTO_FLOW_LABELS IP6_AUTO_FLOW_LABEL_OPTOUT static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { u32 hash; /* @flowlabel may include more than a flow label, eg, the traffic class. * Here we want only the flow label value. */ flowlabel &= IPV6_FLOWLABEL_MASK; if (flowlabel || net->ipv6.sysctl.auto_flowlabels == IP6_AUTO_FLOW_LABEL_OFF || (!autolabel && net->ipv6.sysctl.auto_flowlabels != IP6_AUTO_FLOW_LABEL_FORCED)) return flowlabel; hash = skb_get_hash_flowi6(skb, fl6); /* Since this is being sent on the wire obfuscate hash a bit * to minimize possbility that any useful information to an * attacker is leaked. Only lower 20 bits are relevant. */ hash = rol32(hash, 16); flowlabel = (__force __be32)hash & IPV6_FLOWLABEL_MASK; if (net->ipv6.sysctl.flowlabel_state_ranges) flowlabel |= IPV6_FLOWLABEL_STATELESS_FLAG; return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { switch (net->ipv6.sysctl.auto_flowlabels) { case IP6_AUTO_FLOW_LABEL_OFF: case IP6_AUTO_FLOW_LABEL_OPTIN: default: return 0; case IP6_AUTO_FLOW_LABEL_OPTOUT: case IP6_AUTO_FLOW_LABEL_FORCED: return 1; } } #else static inline __be32 ip6_make_flowlabel(struct net *net, struct sk_buff *skb, __be32 flowlabel, bool autolabel, struct flowi6 *fl6) { return flowlabel; } static inline int ip6_default_np_autolabel(struct net *net) { return 0; } #endif #if IS_ENABLED(CONFIG_IPV6) static inline int ip6_multipath_hash_policy(const struct net *net) { return net->ipv6.sysctl.multipath_hash_policy; } #else static inline int ip6_multipath_hash_policy(const struct net *net) { return 0; } #endif /* * Header manipulation */ static inline void ip6_flow_hdr(struct ipv6hdr *hdr, unsigned int tclass, __be32 flowlabel) { *(__be32 *)hdr = htonl(0x60000000 | (tclass << 20)) | flowlabel; } static inline __be32 ip6_flowinfo(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWINFO_MASK; } static inline __be32 ip6_flowlabel(const struct ipv6hdr *hdr) { return *(__be32 *)hdr & IPV6_FLOWLABEL_MASK; } static inline u8 ip6_tclass(__be32 flowinfo) { return ntohl(flowinfo & IPV6_TCLASS_MASK) >> IPV6_TCLASS_SHIFT; } static inline __be32 ip6_make_flowinfo(unsigned int tclass, __be32 flowlabel) { return htonl(tclass << IPV6_TCLASS_SHIFT) | flowlabel; } static inline __be32 flowi6_get_flowlabel(const struct flowi6 *fl6) { return fl6->flowlabel & IPV6_FLOWLABEL_MASK; } /* * Prototypes exported by ipv6 */ /* * rcv function (called from netdevice level) */ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev); void ipv6_list_rcv(struct list_head *head, struct packet_type *pt, struct net_device *orig_dev); int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb); /* * upper-layer output functions */ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6, __u32 mark, struct ipv6_txoptions *opt, int tclass, u32 priority); int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr); int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags); int ip6_push_pending_frames(struct sock *sk); void ip6_flush_pending_frames(struct sock *sk); int ip6_send_skb(struct sk_buff *skb); struct sk_buff *__ip6_make_skb(struct sock *sk, struct sk_buff_head *queue, struct inet_cork_full *cork, struct inet6_cork *v6_cork); struct sk_buff *ip6_make_skb(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, int length, int transhdrlen, struct ipcm6_cookie *ipc6, struct flowi6 *fl6, struct rt6_info *rt, unsigned int flags, struct inet_cork_full *cork); static inline struct sk_buff *ip6_finish_skb(struct sock *sk) { return __ip6_make_skb(sk, &sk->sk_write_queue, &inet_sk(sk)->cork, &inet6_sk(sk)->cork); } int ip6_dst_lookup(struct net *net, struct sock *sk, struct dst_entry **dst, struct flowi6 *fl6); struct dst_entry *ip6_dst_lookup_flow(struct net *net, const struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst); struct dst_entry *ip6_sk_dst_lookup_flow(struct sock *sk, struct flowi6 *fl6, const struct in6_addr *final_dst, bool connected); struct dst_entry *ip6_dst_lookup_tunnel(struct sk_buff *skb, struct net_device *dev, struct net *net, struct socket *sock, struct in6_addr *saddr, const struct ip_tunnel_info *info, u8 protocol, bool use_cache); struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *orig_dst); /* * skb processing functions */ int ip6_output(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_forward(struct sk_buff *skb); int ip6_input(struct sk_buff *skb); int ip6_mc_input(struct sk_buff *skb); void ip6_protocol_deliver_rcu(struct net *net, struct sk_buff *skb, int nexthdr, bool have_final); int __ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); int ip6_local_out(struct net *net, struct sock *sk, struct sk_buff *skb); /* * Extension header (options) processing */ void ipv6_push_nfrag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto, struct in6_addr **daddr_p, struct in6_addr *saddr); void ipv6_push_frag_opts(struct sk_buff *skb, struct ipv6_txoptions *opt, u8 *proto); int ipv6_skip_exthdr(const struct sk_buff *, int start, u8 *nexthdrp, __be16 *frag_offp); bool ipv6_ext_hdr(u8 nexthdr); enum { IP6_FH_F_FRAG = (1 << 0), IP6_FH_F_AUTH = (1 << 1), IP6_FH_F_SKIP_RH = (1 << 2), }; /* find specified header and get offset to it */ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset, int target, unsigned short *fragoff, int *fragflg); int ipv6_find_tlv(const struct sk_buff *skb, int offset, int type); struct in6_addr *fl6_update_dst(struct flowi6 *fl6, const struct ipv6_txoptions *opt, struct in6_addr *orig); /* * socket options (ipv6_sockglue.c) */ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int ipv6_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int __ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_connect_v6_only(struct sock *sk, struct sockaddr *addr, int addr_len); int ip6_datagram_dst_update(struct sock *sk, bool fix_sk_saddr); void ip6_datagram_release_cb(struct sock *sk); int ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); int ipv6_recv_rxpmtu(struct sock *sk, struct msghdr *msg, int len, int *addr_len); void ipv6_icmp_error(struct sock *sk, struct sk_buff *skb, int err, __be16 port, u32 info, u8 *payload); void ipv6_local_error(struct sock *sk, int err, struct flowi6 *fl6, u32 info); void ipv6_local_rxpmtu(struct sock *sk, struct flowi6 *fl6, u32 mtu); int inet6_release(struct socket *sock); int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet6_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); int inet6_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); /* * reassembly.c */ extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; extern const struct proto_ops inet6_sockraw_ops; struct group_source_req; struct group_filter; int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage *list); int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, struct sockaddr_storage __user *p); #ifdef CONFIG_PROC_FS int ac6_proc_init(struct net *net); void ac6_proc_exit(struct net *net); int raw6_proc_init(void); void raw6_proc_exit(void); int tcp6_proc_init(struct net *net); void tcp6_proc_exit(struct net *net); int udp6_proc_init(struct net *net); void udp6_proc_exit(struct net *net); int udplite6_proc_init(void); void udplite6_proc_exit(void); int ipv6_misc_proc_init(void); void ipv6_misc_proc_exit(void); int snmp6_register_dev(struct inet6_dev *idev); int snmp6_unregister_dev(struct inet6_dev *idev); #else static inline int ac6_proc_init(struct net *net) { return 0; } static inline void ac6_proc_exit(struct net *net) { } static inline int snmp6_register_dev(struct inet6_dev *idev) { return 0; } static inline int snmp6_unregister_dev(struct inet6_dev *idev) { return 0; } #endif #ifdef CONFIG_SYSCTL struct ctl_table *ipv6_icmp_sysctl_init(struct net *net); struct ctl_table *ipv6_route_sysctl_init(struct net *net); int ipv6_sysctl_register(void); void ipv6_sysctl_unregister(void); #endif int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr); int ipv6_sock_mc_join_ssm(struct sock *sk, int ifindex, const struct in6_addr *addr, unsigned int mode); int ipv6_sock_mc_drop(struct sock *sk, int ifindex, const struct in6_addr *addr); static inline int ip6_sock_set_v6only(struct sock *sk) { if (inet_sk(sk)->inet_num) return -EINVAL; lock_sock(sk); sk->sk_ipv6only = true; release_sock(sk); return 0; } static inline void ip6_sock_set_recverr(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->recverr = true; release_sock(sk); } static inline int __ip6_sock_set_addr_preferences(struct sock *sk, int val) { unsigned int pref = 0; unsigned int prefmask = ~0; /* check PUBLIC/TMP/PUBTMP_DEFAULT conflicts */ switch (val & (IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP | IPV6_PREFER_SRC_PUBTMP_DEFAULT)) { case IPV6_PREFER_SRC_PUBLIC: pref |= IPV6_PREFER_SRC_PUBLIC; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_TMP: pref |= IPV6_PREFER_SRC_TMP; prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case IPV6_PREFER_SRC_PUBTMP_DEFAULT: prefmask &= ~(IPV6_PREFER_SRC_PUBLIC | IPV6_PREFER_SRC_TMP); break; case 0: break; default: return -EINVAL; } /* check HOME/COA conflicts */ switch (val & (IPV6_PREFER_SRC_HOME | IPV6_PREFER_SRC_COA)) { case IPV6_PREFER_SRC_HOME: prefmask &= ~IPV6_PREFER_SRC_COA; break; case IPV6_PREFER_SRC_COA: pref |= IPV6_PREFER_SRC_COA; break; case 0: break; default: return -EINVAL; } /* check CGA/NONCGA conflicts */ switch (val & (IPV6_PREFER_SRC_CGA|IPV6_PREFER_SRC_NONCGA)) { case IPV6_PREFER_SRC_CGA: case IPV6_PREFER_SRC_NONCGA: case 0: break; default: return -EINVAL; } inet6_sk(sk)->srcprefs = (inet6_sk(sk)->srcprefs & prefmask) | pref; return 0; } static inline int ip6_sock_set_addr_preferences(struct sock *sk, bool val) { int ret; lock_sock(sk); ret = __ip6_sock_set_addr_preferences(sk, val); release_sock(sk); return ret; } static inline void ip6_sock_set_recvpktinfo(struct sock *sk) { lock_sock(sk); inet6_sk(sk)->rxopt.bits.rxinfo = true; release_sock(sk); } #endif /* _NET_IPV6_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Universal TUN/TAP device driver. * Copyright (C) 1999-2000 Maxim Krasnyansky <max_mk@yahoo.com> */ #ifndef __IF_TUN_H #define __IF_TUN_H #include <uapi/linux/if_tun.h> #include <uapi/linux/virtio_net.h> #define TUN_XDP_FLAG 0x1UL #define TUN_MSG_UBUF 1 #define TUN_MSG_PTR 2 struct tun_msg_ctl { unsigned short type; unsigned short num; void *ptr; }; struct tun_xdp_hdr { int buflen; struct virtio_net_hdr gso; }; #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE) struct socket *tun_get_socket(struct file *); struct ptr_ring *tun_get_tx_ring(struct file *file); static inline bool tun_is_xdp_frame(void *ptr) { return (unsigned long)ptr & TUN_XDP_FLAG; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return (void *)((unsigned long)xdp | TUN_XDP_FLAG); } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return (void *)((unsigned long)ptr & ~TUN_XDP_FLAG); } void tun_ptr_free(void *ptr); #else #include <linux/err.h> #include <linux/errno.h> struct file; struct socket; static inline struct socket *tun_get_socket(struct file *f) { return ERR_PTR(-EINVAL); } static inline struct ptr_ring *tun_get_tx_ring(struct file *f) { return ERR_PTR(-EINVAL); } static inline bool tun_is_xdp_frame(void *ptr) { return false; } static inline void *tun_xdp_to_ptr(struct xdp_frame *xdp) { return NULL; } static inline struct xdp_frame *tun_ptr_to_xdp(void *ptr) { return NULL; } static inline void tun_ptr_free(void *ptr) { } #endif /* CONFIG_TUN */ #endif /* __IF_TUN_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 /* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for diskquota-operations. When diskquota is configured these * macros expand to the right source-code. * * Author: Marco van Wieringen <mvw@planets.elm.net> */ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ #include <linux/fs.h> #define DQUOT_SPACE_WARN 0x1 #define DQUOT_SPACE_RESERVE 0x2 #define DQUOT_SPACE_NOFAIL 0x4 static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } /* i_mutex must being held */ static inline bool is_quota_modification(struct inode *inode, struct iattr *ia) { return (ia->ia_valid & ATTR_SIZE) || (ia->ia_valid & ATTR_UID && !uid_eq(ia->ia_uid, inode->i_uid)) || (ia->ia_valid & ATTR_GID && !gid_eq(ia->ia_gid, inode->i_gid)); } #if defined(CONFIG_QUOTA) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); /* * declaration of quota_function calls in kernel. */ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); static inline struct dquot *dqgrab(struct dquot *dquot) { /* Make sure someone else has active reference to dquot */ WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); atomic_inc(&dquot->dq_count); return dquot; } static inline bool dquot_is_busy(struct dquot *dquot) { if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return true; if (atomic_read(&dquot->dq_count) > 1) return true; return false; } void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ static inline int dquot_suspend(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_SUSPENDED); } int dquot_resume(struct super_block *sb, int type); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_get_next_id(struct super_block *sb, struct kqid *qid); int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int type); int dquot_writeback_dquots(struct super_block *sb, int type); int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_state(struct super_block *sb, struct qc_state *state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id, struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { return sb_dqopt(sb)->info + type; } /* * Functions for checking status of quota */ static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } static inline unsigned sb_any_quota_suspended(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED); } /* Does kernel know about any quota information for given sb + type? */ static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } static inline unsigned sb_any_quota_loaded(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED); } static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } /* * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_suspended(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_suspended(struct super_block *sb) { return 0; } /* Does kernel know about any quota information for given sb + type? */ static inline int sb_has_quota_loaded(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_loaded(struct super_block *sb) { return 0; } static inline int sb_has_quota_active(struct super_block *sb, int type) { return 0; } static inline int dquot_initialize(struct inode *inode) { return 0; } static inline bool dquot_initialize_needed(struct inode *inode) { return false; } static inline void dquot_drop(struct inode *inode) { } static inline int dquot_alloc_inode(struct inode *inode) { return 0; } static inline void dquot_free_inode(struct inode *inode) { } static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) { return 0; } static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_add_bytes(inode, number); return 0; } static inline void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_sub_bytes(inode, number); } static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); return 0; } static inline int dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { inode_sub_bytes(inode, number); return 0; } static inline int dquot_disable(struct super_block *sb, int type, unsigned int flags) { return 0; } static inline int dquot_suspend(struct super_block *sb, int type) { return 0; } static inline int dquot_resume(struct super_block *sb, int type) { return 0; } #define dquot_file_open generic_file_open static inline int dquot_writeback_dquots(struct super_block *sb, int type) { return 0; } #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN); } static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { int ret; ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) { /* * Mark inode fully dirty. Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly * reduces lock contention. */ mark_inode_dirty(inode); } return ret; } static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr) { dquot_alloc_space_nofail(inode, nr << inode->i_blkbits); } static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { return dquot_alloc_space(inode, nr << inode->i_blkbits); } static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0); } static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } static inline int dquot_claim_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) { dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr, 0); } static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_free_block(struct inode *inode, qsize_t nr) { dquot_free_space(inode, nr << inode->i_blkbits); } static inline void dquot_release_reservation_block(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } unsigned int qtype_enforce_flag(int type); #endif /* _LINUX_QUOTAOPS_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * include/linux/eventpoll.h ( Efficient event polling implementation ) * Copyright (C) 2001,...,2006 Davide Libenzi * * Davide Libenzi <davidel@xmailserver.org> */ #ifndef _LINUX_EVENTPOLL_H #define _LINUX_EVENTPOLL_H #include <uapi/linux/eventpoll.h> #include <uapi/linux/kcmp.h> /* Forward declarations to avoid compiler errors */ struct file; #ifdef CONFIG_EPOLL #ifdef CONFIG_KCMP struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, unsigned long toff); #endif /* Used to initialize the epoll bits inside the "struct file" */ static inline void eventpoll_init_file(struct file *file) { INIT_LIST_HEAD(&file->f_ep_links); INIT_LIST_HEAD(&file->f_tfile_llink); } /* Used to release the epoll bits inside the "struct file" */ void eventpoll_release_file(struct file *file); /* * This is called from inside fs/file_table.c:__fput() to unlink files * from the eventpoll interface. We need to have this facility to cleanup * correctly files that are closed without being removed from the eventpoll * interface. */ static inline void eventpoll_release(struct file *file) { /* * Fast check to avoid the get/release of the semaphore. Since * we're doing this outside the semaphore lock, it might return * false negatives, but we don't care. It'll help in 99.99% of cases * to avoid the semaphore lock. False positives simply cannot happen * because the file in on the way to be removed and nobody ( but * eventpoll ) has still a reference to this file. */ if (likely(list_empty(&file->f_ep_links))) return; /* * The file is being closed while it is still linked to an epoll * descriptor. We need to handle this by correctly unlinking it * from its containers. */ eventpoll_release_file(file); } int do_epoll_ctl(int epfd, int op, int fd, struct epoll_event *epds, bool nonblock); /* Tells if the epoll_ctl(2) operation needs an event copy from userspace */ static inline int ep_op_has_event(int op) { return op != EPOLL_CTL_DEL; } #else static inline void eventpoll_init_file(struct file *file) {} static inline void eventpoll_release(struct file *file) {} #endif #endif /* #ifndef _LINUX_EVENTPOLL_H */
1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 // SPDX-License-Identifier: GPL-2.0 /* * Copyright(C) 2005-2006, Thomas Gleixner <tglx@linutronix.de> * Copyright(C) 2005-2007, Red Hat, Inc., Ingo Molnar * Copyright(C) 2006-2007 Timesys Corp., Thomas Gleixner * * High-resolution kernel timers * * In contrast to the low-resolution timeout API, aka timer wheel, * hrtimers provide finer resolution and accuracy depending on system * configuration and capabilities. * * Started by: Thomas Gleixner and Ingo Molnar * * Credits: * Based on the original timer wheel code * * Help, testing, suggestions, bugfixes, improvements were * provided by: * * George Anzinger, Andrew Morton, Steven Rostedt, Roman Zippel * et. al. */ #include <linux/cpu.h> #include <linux/export.h> #include <linux/percpu.h> #include <linux/hrtimer.h> #include <linux/notifier.h> #include <linux/syscalls.h> #include <linux/interrupt.h> #include <linux/tick.h> #include <linux/err.h> #include <linux/debugobjects.h> #include <linux/sched/signal.h> #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> #include <linux/sched/nohz.h> #include <linux/sched/debug.h> #include <linux/timer.h> #include <linux/freezer.h> #include <linux/compat.h> #include <linux/uaccess.h> #include <trace/events/timer.h> #include "tick-internal.h" /* * Masks for selecting the soft and hard context timers from * cpu_base->active */ #define MASK_SHIFT (HRTIMER_BASE_MONOTONIC_SOFT) #define HRTIMER_ACTIVE_HARD ((1U << MASK_SHIFT) - 1) #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) /* * The timer bases: * * There are more clockids than hrtimer bases. Thus, we index * into the timer bases by the hrtimer_base_type enum. When trying * to reach a base using a clockid, hrtimer_clockid_to_base() * is used to convert from clockid to the proper hrtimer_base_type. */ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { .index = HRTIMER_BASE_MONOTONIC, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, { .index = HRTIMER_BASE_MONOTONIC_SOFT, .clockid = CLOCK_MONOTONIC, .get_time = &ktime_get, }, { .index = HRTIMER_BASE_REALTIME_SOFT, .clockid = CLOCK_REALTIME, .get_time = &ktime_get_real, }, { .index = HRTIMER_BASE_BOOTTIME_SOFT, .clockid = CLOCK_BOOTTIME, .get_time = &ktime_get_boottime, }, { .index = HRTIMER_BASE_TAI_SOFT, .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { /* Make sure we catch unsupported clockids */ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES, [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME, [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC, [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME, [CLOCK_TAI] = HRTIMER_BASE_TAI, }; /* * Functions and macros which are different for UP/SMP systems are kept in a * single place */ #ifdef CONFIG_SMP /* * We require the migration_base for lock_hrtimer_base()/switch_hrtimer_base() * such that hrtimer_callback_running() can unconditionally dereference * timer->base->cpu_base */ static struct hrtimer_cpu_base migration_cpu_base = { .clock_base = { { .cpu_base = &migration_cpu_base, .seq = SEQCNT_RAW_SPINLOCK_ZERO(migration_cpu_base.seq, &migration_cpu_base.lock), }, }, }; #define migration_base migration_cpu_base.clock_base[0] static inline bool is_migration_base(struct hrtimer_clock_base *base) { return base == &migration_base; } /* * We are using hashed locking: holding per_cpu(hrtimer_bases)[n].lock * means that all timers which are tied to this base via timer->base are * locked, and the base itself is locked too. * * So __run_timers/migrate_timers can safely modify all timers which could * be found on the lists/queues. * * When the timer's base is locked, and the timer removed from list, it is * possible to set timer->base = &migration_base and drop the lock: the timer * remains locked. */ static struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base; for (;;) { base = READ_ONCE(timer->base); if (likely(base != &migration_base)) { raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); if (likely(base == timer->base)) return base; /* The timer has migrated to another CPU: */ raw_spin_unlock_irqrestore(&base->cpu_base->lock, *flags); } cpu_relax(); } } /* * We do not migrate the timer when it is expiring before the next * event on the target cpu. When high resolution is enabled, we cannot * reprogram the target cpu hardware and we would cause it to fire * late. To keep it simple, we handle the high resolution enabled and * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ static int hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); return expires < new_base->cpu_base->expires_next; } static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); #endif return base; } /* * We switch the timer base to a power-optimized selected CPU target, * if: * - NO_HZ_COMMON is enabled * - timer migration is enabled * - the timer callback is not running * - the timer is not the first expiring timer on the new target * * If one of the above requirements is not fulfilled we move the timer * to the current CPU or leave it on the previously assigned CPU if * the timer callback is currently running. */ static inline struct hrtimer_clock_base * switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base, int pinned) { struct hrtimer_cpu_base *new_cpu_base, *this_cpu_base; struct hrtimer_clock_base *new_base; int basenum = base->index; this_cpu_base = this_cpu_ptr(&hrtimer_bases); new_cpu_base = get_target_base(this_cpu_base, pinned); again: new_base = &new_cpu_base->clock_base[basenum]; if (base != new_base) { /* * We are trying to move timer to new_base. * However we can't change timer's base while it is running, * so we keep it on the same CPU. No hassle vs. reprogramming * the event source in the high resolution case. The softirq * code will take care of this when the timer function has * completed. There is no conflict as we hold the lock until * the timer is enqueued. */ if (unlikely(hrtimer_callback_running(timer))) return base; /* See the comment in lock_hrtimer_base() */ WRITE_ONCE(timer->base, &migration_base); raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; WRITE_ONCE(timer->base, base); goto again; } WRITE_ONCE(timer->base, new_base); } else { if (new_cpu_base != this_cpu_base && hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } } return new_base; } #else /* CONFIG_SMP */ static inline bool is_migration_base(struct hrtimer_clock_base *base) { return false; } static inline struct hrtimer_clock_base * lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { struct hrtimer_clock_base *base = timer->base; raw_spin_lock_irqsave(&base->cpu_base->lock, *flags); return base; } # define switch_hrtimer_base(t, b, p) (b) #endif /* !CONFIG_SMP */ /* * Functions for the union type storage format of ktime_t which are * too large for inlining: */ #if BITS_PER_LONG < 64 /* * Divide a ktime value by a nanosecond value */ s64 __ktime_divns(const ktime_t kt, s64 div) { int sft = 0; s64 dclc; u64 tmp; dclc = ktime_to_ns(kt); tmp = dclc < 0 ? -dclc : dclc; /* Make sure the divisor is less than 2^32: */ while (div >> 32) { sft++; div >>= 1; } tmp >>= sft; do_div(tmp, (u32) div); return dclc < 0 ? -tmp : tmp; } EXPORT_SYMBOL_GPL(__ktime_divns); #endif /* BITS_PER_LONG >= 64 */ /* * Add two ktime values and do a safety check for overflow: */ ktime_t ktime_add_safe(const ktime_t lhs, const ktime_t rhs) { ktime_t res = ktime_add_unsafe(lhs, rhs); /* * We use KTIME_SEC_MAX here, the maximum timeout which we can * return to user space in a timespec: */ if (res < 0 || res < lhs || res < rhs) res = ktime_set(KTIME_SEC_MAX, 0); return res; } EXPORT_SYMBOL_GPL(ktime_add_safe); #ifdef CONFIG_DEBUG_OBJECTS_TIMERS static const struct debug_obj_descr hrtimer_debug_descr; static void *hrtimer_debug_hint(void *addr) { return ((struct hrtimer *) addr)->function; } /* * fixup_init is called when: * - an active object is initialized */ static bool hrtimer_fixup_init(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_init(timer, &hrtimer_debug_descr); return true; default: return false; } } /* * fixup_activate is called when: * - an active object is activated * - an unknown non-static object is activated */ static bool hrtimer_fixup_activate(void *addr, enum debug_obj_state state) { switch (state) { case ODEBUG_STATE_ACTIVE: WARN_ON(1); fallthrough; default: return false; } } /* * fixup_free is called when: * - an active object is freed */ static bool hrtimer_fixup_free(void *addr, enum debug_obj_state state) { struct hrtimer *timer = addr; switch (state) { case ODEBUG_STATE_ACTIVE: hrtimer_cancel(timer); debug_object_free(timer, &hrtimer_debug_descr); return true; default: return false; } } static const struct debug_obj_descr hrtimer_debug_descr = { .name = "hrtimer", .debug_hint = hrtimer_debug_hint, .fixup_init = hrtimer_fixup_init, .fixup_activate = hrtimer_fixup_activate, .fixup_free = hrtimer_fixup_free, }; static inline void debug_hrtimer_init(struct hrtimer *timer) { debug_object_init(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_object_activate(timer, &hrtimer_debug_descr); } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { debug_object_deactivate(timer, &hrtimer_debug_descr); } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_on_stack(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(timer, &hrtimer_debug_descr); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_on_stack); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode); void hrtimer_init_sleeper_on_stack(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_object_init_on_stack(&sl->timer, &hrtimer_debug_descr); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper_on_stack); void destroy_hrtimer_on_stack(struct hrtimer *timer) { debug_object_free(timer, &hrtimer_debug_descr); } EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack); #else static inline void debug_hrtimer_init(struct hrtimer *timer) { } static inline void debug_hrtimer_activate(struct hrtimer *timer, enum hrtimer_mode mode) { } static inline void debug_hrtimer_deactivate(struct hrtimer *timer) { } #endif static inline void debug_init(struct hrtimer *timer, clockid_t clockid, enum hrtimer_mode mode) { debug_hrtimer_init(timer); trace_hrtimer_init(timer, clockid, mode); } static inline void debug_activate(struct hrtimer *timer, enum hrtimer_mode mode) { debug_hrtimer_activate(timer, mode); trace_hrtimer_start(timer, mode); } static inline void debug_deactivate(struct hrtimer *timer) { debug_hrtimer_deactivate(timer); trace_hrtimer_cancel(timer); } static struct hrtimer_clock_base * __next_base(struct hrtimer_cpu_base *cpu_base, unsigned int *active) { unsigned int idx; if (!*active) return NULL; idx = __ffs(*active); *active &= ~(1U << idx); return &cpu_base->clock_base[idx]; } #define for_each_active_base(base, cpu_base, active) \ while ((base = __next_base((cpu_base), &(active)))) static ktime_t __hrtimer_next_event_base(struct hrtimer_cpu_base *cpu_base, const struct hrtimer *exclude, unsigned int active, ktime_t expires_next) { struct hrtimer_clock_base *base; ktime_t expires; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *next; struct hrtimer *timer; next = timerqueue_getnext(&base->active); timer = container_of(next, struct hrtimer, node); if (timer == exclude) { /* Get to the next timer in the queue. */ next = timerqueue_iterate_next(next); if (!next) continue; timer = container_of(next, struct hrtimer, node); } expires = ktime_sub(hrtimer_get_expires(timer), base->offset); if (expires < expires_next) { expires_next = expires; /* Skip cpu_base update if a timer is being excluded. */ if (exclude) continue; if (timer->is_soft) cpu_base->softirq_next_timer = timer; else cpu_base->next_timer = timer; } } /* * clock_was_set() might have changed base->offset of any of * the clock bases so the result might be negative. Fix it up * to prevent a false positive in clockevents_program_event(). */ if (expires_next < 0) expires_next = 0; return expires_next; } /* * Recomputes cpu_base::*next_timer and returns the earliest expires_next * but does not set cpu_base::*expires_next, that is done by * hrtimer[_force]_reprogram and hrtimer_interrupt only. When updating * cpu_base::*expires_next right away, reprogramming logic would no longer * work. * * When a softirq is pending, we can ignore the HRTIMER_ACTIVE_SOFT bases, * those timers will get run whenever the softirq gets handled, at the end of * hrtimer_run_softirq(), hrtimer_update_softirq_timer() will re-add these bases. * * Therefore softirq values are those from the HRTIMER_ACTIVE_SOFT clock bases. * The !softirq values are the minima across HRTIMER_ACTIVE_ALL, unless an actual * softirq is pending, in which case they're the minima of HRTIMER_ACTIVE_HARD. * * @active_mask must be one of: * - HRTIMER_ACTIVE_ALL, * - HRTIMER_ACTIVE_SOFT, or * - HRTIMER_ACTIVE_HARD. */ static ktime_t __hrtimer_get_next_event(struct hrtimer_cpu_base *cpu_base, unsigned int active_mask) { unsigned int active; struct hrtimer *next_timer = NULL; ktime_t expires_next = KTIME_MAX; if (!cpu_base->softirq_activated && (active_mask & HRTIMER_ACTIVE_SOFT)) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; cpu_base->softirq_next_timer = NULL; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, KTIME_MAX); next_timer = cpu_base->softirq_next_timer; } if (active_mask & HRTIMER_ACTIVE_HARD) { active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; cpu_base->next_timer = next_timer; expires_next = __hrtimer_next_event_base(cpu_base, NULL, active, expires_next); } return expires_next; } static ktime_t hrtimer_update_next_event(struct hrtimer_cpu_base *cpu_base) { ktime_t expires_next, soft = KTIME_MAX; /* * If the soft interrupt has already been activated, ignore the * soft bases. They will be handled in the already raised soft * interrupt. */ if (!cpu_base->softirq_activated) { soft = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * Update the soft expiry time. clock_settime() might have * affected it. */ cpu_base->softirq_expires_next = soft; } expires_next = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_HARD); /* * If a softirq timer is expiring first, update cpu_base->next_timer * and program the hardware with the soft expiry time. */ if (expires_next > soft) { cpu_base->next_timer = cpu_base->softirq_next_timer; expires_next = soft; } return expires_next; } static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) { ktime_t *offs_real = &base->clock_base[HRTIMER_BASE_REALTIME].offset; ktime_t *offs_boot = &base->clock_base[HRTIMER_BASE_BOOTTIME].offset; ktime_t *offs_tai = &base->clock_base[HRTIMER_BASE_TAI].offset; ktime_t now = ktime_get_update_offsets_now(&base->clock_was_set_seq, offs_real, offs_boot, offs_tai); base->clock_base[HRTIMER_BASE_REALTIME_SOFT].offset = *offs_real; base->clock_base[HRTIMER_BASE_BOOTTIME_SOFT].offset = *offs_boot; base->clock_base[HRTIMER_BASE_TAI_SOFT].offset = *offs_tai; return now; } /* * Is the high resolution mode active ? */ static inline int __hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? cpu_base->hres_active : 0; } static inline int hrtimer_hres_active(void) { return __hrtimer_hres_active(this_cpu_ptr(&hrtimer_bases)); } /* * Reprogram the event source with checking both queues for the * next event * Called with interrupts disabled and base->lock held */ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base, int skip_equal) { ktime_t expires_next; expires_next = hrtimer_update_next_event(cpu_base); if (skip_equal && expires_next == cpu_base->expires_next) return; cpu_base->expires_next = expires_next; /* * If hres is not active, hardware does not have to be * reprogrammed yet. * * If a hang was detected in the last timer interrupt then we * leave the hang delay active in the hardware. We want the * system to make progress. That also prevents the following * scenario: * T1 expires 50ms from now * T2 expires 5s from now * * T1 is removed, so this code is called and would reprogram * the hardware to 5s from now. Any hrtimer_start after that * will not reprogram the hardware due to hang_detected being * set. So we'd effectivly block all timers until the T2 event * fires. */ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; tick_program_event(cpu_base->expires_next, 1); } /* High resolution timer related functions */ #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer enabled ? */ static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); /* * Enable / Disable high resolution mode */ static int __init setup_hrtimer_hres(char *str) { return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); /* * hrtimer_high_res_enabled - query, if the highres mode is enabled */ static inline int hrtimer_is_hres_enabled(void) { return hrtimer_hres_enabled; } /* * Retrigger next event is called after clock was set * * Called with interrupts disabled via on_each_cpu() */ static void retrigger_next_event(void *arg) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (!__hrtimer_hres_active(base)) return; raw_spin_lock(&base->lock); hrtimer_update_base(base); hrtimer_force_reprogram(base, 0); raw_spin_unlock(&base->lock); } /* * Switch to high resolution mode */ static void hrtimer_switch_to_hres(void) { struct hrtimer_cpu_base *base = this_cpu_ptr(&hrtimer_bases); if (tick_init_highres()) { pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } base->hres_active = 1; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); } #else static inline int hrtimer_is_hres_enabled(void) { return 0; } static inline void hrtimer_switch_to_hres(void) { } static inline void retrigger_next_event(void *arg) { } #endif /* CONFIG_HIGH_RES_TIMERS */ /* * When a timer is enqueued and expires earlier than the already enqueued * timers, we have to check, whether it expires earlier than the timer for * which the clock event device was armed. * * Called with interrupts disabled and base->cpu_base.lock held */ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *base = timer->base; ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset); WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0); /* * CLOCK_REALTIME timer might be requested with an absolute * expiry time which is less than base->offset. Set it to 0. */ if (expires < 0) expires = 0; if (timer->is_soft) { /* * soft hrtimer could be started on a remote CPU. In this * case softirq_expires_next needs to be updated on the * remote CPU. The soft hrtimer will not expire before the * first hard hrtimer on the remote CPU - * hrtimer_check_target() prevents this case. */ struct hrtimer_cpu_base *timer_cpu_base = base->cpu_base; if (timer_cpu_base->softirq_activated) return; if (!ktime_before(expires, timer_cpu_base->softirq_expires_next)) return; timer_cpu_base->softirq_next_timer = timer; timer_cpu_base->softirq_expires_next = expires; if (!ktime_before(expires, timer_cpu_base->expires_next) || !reprogram) return; } /* * If the timer is not on the current cpu, we cannot reprogram * the other cpus clock event device. */ if (base->cpu_base != cpu_base) return; /* * If the hrtimer interrupt is running, then it will * reevaluate the clock bases and reprogram the clock event * device. The callbacks are always executed in hard interrupt * context so we don't need an extra check for a running * callback. */ if (cpu_base->in_hrtirq) return; if (expires >= cpu_base->expires_next) return; /* Update the pointer to the next expiring timer */ cpu_base->next_timer = timer; cpu_base->expires_next = expires; /* * If hres is not active, hardware does not have to be * programmed yet. * * If a hang was detected in the last timer interrupt then we * do not schedule a timer which is earlier than the expiry * which we enforced in the hang detection. We want the system * to make progress. */ if (!__hrtimer_hres_active(cpu_base) || cpu_base->hang_detected) return; /* * Program the timer hardware. We enforce the expiry for * events which are already in the past. */ tick_program_event(expires, 1); } /* * Clock realtime was set * * Change the offset of the realtime clock vs. the monotonic * clock. * * We might have to reprogram the high resolution timer interrupt. On * SMP we call the architecture specific code to retrigger _all_ high * resolution timer interrupts. On UP we just disable interrupts and * call the high resolution interrupt code. */ void clock_was_set(void) { #ifdef CONFIG_HIGH_RES_TIMERS /* Retrigger the CPU local events everywhere */ on_each_cpu(retrigger_next_event, NULL, 1); #endif timerfd_clock_was_set(); } static void clock_was_set_work(struct work_struct *work) { clock_was_set(); } static DECLARE_WORK(hrtimer_work, clock_was_set_work); /* * Called from timekeeping and resume code to reprogram the hrtimer * interrupt device on all cpus and to notify timerfd. */ void clock_was_set_delayed(void) { schedule_work(&hrtimer_work); } /* * During resume we might have to reprogram the high resolution timer * interrupt on all online CPUs. However, all other CPUs will be * stopped with IRQs interrupts disabled so the clock_was_set() call * must be deferred. */ void hrtimers_resume(void) { lockdep_assert_irqs_disabled(); /* Retrigger on the local CPU */ retrigger_next_event(NULL); /* And schedule a retrigger for all others */ clock_was_set_delayed(); } /* * Counterpart to lock_hrtimer_base above: */ static inline void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags) { raw_spin_unlock_irqrestore(&timer->base->cpu_base->lock, *flags); } /** * hrtimer_forward - forward the timer expiry * @timer: hrtimer to forward * @now: forward past this time * @interval: the interval to forward * * Forward the timer expiry so it will expire in the future. * Returns the number of overruns. * * Can be safely called from the callback function of @timer. If * called from other contexts @timer must neither be enqueued nor * running the callback and the caller needs to take care of * serialization. * * Note: This only updates the timer expiry value and does not requeue * the timer. */ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) { u64 orun = 1; ktime_t delta; delta = ktime_sub(now, hrtimer_get_expires(timer)); if (delta < 0) return 0; if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) return 0; if (interval < hrtimer_resolution) interval = hrtimer_resolution; if (unlikely(delta >= interval)) { s64 incr = ktime_to_ns(interval); orun = ktime_divns(delta, incr); hrtimer_add_expires_ns(timer, incr * orun); if (hrtimer_get_expires_tv64(timer) > now) return orun; /* * This (and the ktime_add() below) is the * correction for exact: */ orun++; } hrtimer_add_expires(timer, interval); return orun; } EXPORT_SYMBOL_GPL(hrtimer_forward); /* * enqueue_hrtimer - internal function to (re)start a timer * * The timer is inserted in expiry order. Insertion into the * red black tree is O(log(n)). Must hold the base lock. * * Returns 1 when the new timer is the leftmost timer in the tree. */ static int enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, enum hrtimer_mode mode) { debug_activate(timer, mode); base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } /* * __remove_hrtimer - internal function to remove a timer * * Caller must hold the base lock. * * High resolution timer mode reprograms the clock event device when the * timer is the one which expires next. The caller can disable this by setting * reprogram to zero. This is useful, when the context does a reprogramming * anyway (e.g. timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; u8 state = timer->state; /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->state, newstate); if (!(state & HRTIMER_STATE_ENQUEUED)) return; if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* * Note: If reprogram is false we do not update * cpu_base->next_timer. This happens when we remove the first * timer on a remote cpu. No harm as we never dereference * cpu_base->next_timer. So the worst thing what can happen is * an superflous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ if (reprogram && timer == cpu_base->next_timer) hrtimer_force_reprogram(cpu_base, 1); } /* * remove hrtimer, called with base lock held */ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { u8 state = timer->state; if (state & HRTIMER_STATE_ENQUEUED) { bool reprogram; /* * Remove the timer and force reprogramming when high * resolution mode is active and the timer is on the current * CPU. If we remove a timer on another CPU, reprogramming is * skipped. The interrupt event on this CPU is fired and * reprogramming happens in the interrupt handler. This is a * rare case and less expensive than a smp call. */ debug_deactivate(timer); reprogram = base->cpu_base == this_cpu_ptr(&hrtimer_bases); /* * If the timer is not restarted then reprogramming is * required if the timer is local. If it is local and about * to be restarted, avoid programming it twice (on removal * and a moment later when it's requeued). */ if (!restart) state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; __remove_hrtimer(timer, base, state, reprogram); return 1; } return 0; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) { #ifdef CONFIG_TIME_LOW_RES /* * CONFIG_TIME_LOW_RES indicates that the system has no way to return * granular time values. For relative timers we add hrtimer_resolution * (i.e. one jiffie) to prevent short timeouts. */ timer->is_rel = mode & HRTIMER_MODE_REL; if (timer->is_rel) tim = ktime_add_safe(tim, hrtimer_resolution); #endif return tim; } static void hrtimer_update_softirq_timer(struct hrtimer_cpu_base *cpu_base, bool reprogram) { ktime_t expires; /* * Find the next SOFT expiration. */ expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_SOFT); /* * reprogramming needs to be triggered, even if the next soft * hrtimer expires at the same time than the next hard * hrtimer. cpu_base->softirq_expires_next needs to be updated! */ if (expires == KTIME_MAX) return; /* * cpu_base->*next_timer is recomputed by __hrtimer_get_next_event() * cpu_base->*expires_next is only set by hrtimer_reprogram() */ hrtimer_reprogram(cpu_base->softirq_next_timer, reprogram); } static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { struct hrtimer_clock_base *new_base; bool force_local, first; /* * If the timer is on the local cpu base and is the first expiring * timer then this might end up reprogramming the hardware twice * (on removal and on enqueue). To avoid that by prevent the * reprogram on removal, keep the timer local to the current CPU * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the * remote data correctly. * * If it's on the current CPU and the first expiring timer, then * skip reprogramming, keep the timer local and enforce * reprogramming later if it was the first expiring timer. This * avoids programming the underlying clock event twice (once at * removal and once after enqueue). */ remove_hrtimer(timer, base, true, force_local); if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); /* Switch the timer base, if necessary: */ if (!force_local) { new_base = switch_hrtimer_base(timer, base, mode & HRTIMER_MODE_PINNED); } else { new_base = base; } first = enqueue_hrtimer(timer, new_base, mode); if (!force_local) return first; /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the * hardware by evaluating the new first expiring timer. */ hrtimer_force_reprogram(new_base->cpu_base, 1); return 0; } /** * hrtimer_start_range_ns - (re)start an hrtimer * @timer: the timer to be added * @tim: expiry time * @delta_ns: "slack" range for the timer * @mode: timer mode: absolute (HRTIMER_MODE_ABS) or * relative (HRTIMER_MODE_REL), and pinned (HRTIMER_MODE_PINNED); * softirq based mode is considered for debug purpose only! */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base; unsigned long flags; /* * Check whether the HRTIMER_MODE_SOFT bit and hrtimer.is_soft * match on CONFIG_PREEMPT_RT = n. With PREEMPT_RT check the hard * expiry mode because unmarked timers are moved to softirq expiry. */ if (!IS_ENABLED(CONFIG_PREEMPT_RT)) WARN_ON_ONCE(!(mode & HRTIMER_MODE_SOFT) ^ !timer->is_soft); else WARN_ON_ONCE(!(mode & HRTIMER_MODE_HARD) ^ !timer->is_hard); base = lock_hrtimer_base(timer, &flags); if (__hrtimer_start_range_ns(timer, tim, delta_ns, mode, base)) hrtimer_reprogram(timer, true); unlock_hrtimer_base(timer, &flags); } EXPORT_SYMBOL_GPL(hrtimer_start_range_ns); /** * hrtimer_try_to_cancel - try to deactivate a timer * @timer: hrtimer to stop * * Returns: * * * 0 when the timer was not active * * 1 when the timer was active * * -1 when the timer is currently executing the callback function and * cannot be stopped */ int hrtimer_try_to_cancel(struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned long flags; int ret = -1; /* * Check lockless first. If the timer is not active (neither * enqueued nor running the callback, nothing to do here. The * base lock does not serialize against a concurrent enqueue, * so we can avoid taking it. */ if (!hrtimer_active(timer)) return 0; base = lock_hrtimer_base(timer, &flags); if (!hrtimer_callback_running(timer)) ret = remove_hrtimer(timer, base, false, false); unlock_hrtimer_base(timer, &flags); return ret; } EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel); #ifdef CONFIG_PREEMPT_RT static void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { spin_lock_init(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { spin_lock(&base->softirq_expiry_lock); } static void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { spin_unlock(&base->softirq_expiry_lock); } /* * The counterpart to hrtimer_cancel_wait_running(). * * If there is a waiter for cpu_base->expiry_lock, then it was waiting for * the timer callback to finish. Drop expiry_lock and reaquire it. That * allows the waiter to acquire the lock and make progress. */ static void hrtimer_sync_wait_running(struct hrtimer_cpu_base *cpu_base, unsigned long flags) { if (atomic_read(&cpu_base->timer_waiters)) { raw_spin_unlock_irqrestore(&cpu_base->lock, flags); spin_unlock(&cpu_base->softirq_expiry_lock); spin_lock(&cpu_base->softirq_expiry_lock); raw_spin_lock_irq(&cpu_base->lock); } } /* * This function is called on PREEMPT_RT kernels when the fast path * deletion of a timer failed because the timer callback function was * running. * * This prevents priority inversion: if the soft irq thread is preempted * in the middle of a timer callback, then calling del_timer_sync() can * lead to two issues: * * - If the caller is on a remote CPU then it has to spin wait for the timer * handler to complete. This can result in unbound priority inversion. * * - If the caller originates from the task which preempted the timer * handler on the same CPU, then spin waiting for the timer handler to * complete is never going to end. */ void hrtimer_cancel_wait_running(const struct hrtimer *timer) { /* Lockless read. Prevent the compiler from reloading it below */ struct hrtimer_clock_base *base = READ_ONCE(timer->base); /* * Just relax if the timer expires in hard interrupt context or if * it is currently on the migration base. */ if (!timer->is_soft || is_migration_base(base)) { cpu_relax(); return; } /* * Mark the base as contended and grab the expiry lock, which is * held by the softirq across the timer callback. Drop the lock * immediately so the softirq can expire the next timer. In theory * the timer could already be running again, but that's more than * unlikely and just causes another wait loop. */ atomic_inc(&base->cpu_base->timer_waiters); spin_lock_bh(&base->cpu_base->softirq_expiry_lock); atomic_dec(&base->cpu_base->timer_waiters); spin_unlock_bh(&base->cpu_base->softirq_expiry_lock); } #else static inline void hrtimer_cpu_base_init_expiry_lock(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_lock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_cpu_base_unlock_expiry(struct hrtimer_cpu_base *base) { } static inline void hrtimer_sync_wait_running(struct hrtimer_cpu_base *base, unsigned long flags) { } #endif /** * hrtimer_cancel - cancel a timer and wait for the handler to finish. * @timer: the timer to be cancelled * * Returns: * 0 when the timer was not active * 1 when the timer was active */ int hrtimer_cancel(struct hrtimer *timer) { int ret; do { ret = hrtimer_try_to_cancel(timer); if (ret < 0) hrtimer_cancel_wait_running(timer); } while (ret < 0); return ret; } EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) rem = hrtimer_expires_remaining_adjusted(timer); else rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** * hrtimer_get_next_event - get the time until next expiry event * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ u64 hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (!__hrtimer_hres_active(cpu_base)) expires = __hrtimer_get_next_event(cpu_base, HRTIMER_ACTIVE_ALL); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } /** * hrtimer_next_event_without - time until next expiry event w/o one timer * @exclude: timer to exclude * * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ u64 hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); u64 expires = KTIME_MAX; unsigned long flags; raw_spin_lock_irqsave(&cpu_base->lock, flags); if (__hrtimer_hres_active(cpu_base)) { unsigned int active; if (!cpu_base->softirq_activated) { active = cpu_base->active_bases & HRTIMER_ACTIVE_SOFT; expires = __hrtimer_next_event_base(cpu_base, exclude, active, KTIME_MAX); } active = cpu_base->active_bases & HRTIMER_ACTIVE_HARD; expires = __hrtimer_next_event_base(cpu_base, exclude, active, expires); } raw_spin_unlock_irqrestore(&cpu_base->lock, flags); return expires; } #endif static inline int hrtimer_clockid_to_base(clockid_t clock_id) { if (likely(clock_id < MAX_CLOCKS)) { int base = hrtimer_clock_to_base_table[clock_id]; if (likely(base != HRTIMER_MAX_CLOCK_BASES)) return base; } WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id); return HRTIMER_BASE_MONOTONIC; } static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { bool softtimer = !!(mode & HRTIMER_MODE_SOFT); struct hrtimer_cpu_base *cpu_base; int base; /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitely * marked for hard interrupt expiry mode are moved into soft * interrupt context for latency reasons and because the callbacks * can invoke functions which might sleep on RT, e.g. spin_lock(). */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(mode & HRTIMER_MODE_HARD)) softtimer = true; memset(timer, 0, sizeof(struct hrtimer)); cpu_base = raw_cpu_ptr(&hrtimer_bases); /* * POSIX magic: Relative CLOCK_REALTIME timers are not affected by * clock modifications, so they needs to become CLOCK_MONOTONIC to * ensure POSIX compliance. */ if (clock_id == CLOCK_REALTIME && mode & HRTIMER_MODE_REL) clock_id = CLOCK_MONOTONIC; base = softtimer ? HRTIMER_MAX_CLOCK_BASES / 2 : 0; base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); } /** * hrtimer_init - initialize a timer to the given clock * @timer: the timer to be initialized * @clock_id: the clock to be used * @mode: The modes which are relevant for intitialization: * HRTIMER_MODE_ABS, HRTIMER_MODE_REL, HRTIMER_MODE_ABS_SOFT, * HRTIMER_MODE_REL_SOFT * * The PINNED variants of the above can be handed in, * but the PINNED bit is ignored as pinning happens * when the hrtimer is started */ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(timer, clock_id, mode); __hrtimer_init(timer, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init); /* * A timer is active, when it is enqueued into the rbtree or the * callback function is running or it's in the state of being migrated * to another cpu. * * It is important for this function to not return a false negative. */ bool hrtimer_active(const struct hrtimer *timer) { struct hrtimer_clock_base *base; unsigned int seq; do { base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); return false; } EXPORT_SYMBOL_GPL(hrtimer_active); /* * The write_seqcount_barrier()s in __run_hrtimer() split the thing into 3 * distinct sections: * * - queued: the timer is queued * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * * On the read side we ensure we observe timer->state and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. * * The sequence numbers are required because otherwise we could still observe * a false negative if the read side got smeared over multiple consequtive * __run_hrtimer() invocations. */ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_clock_base *base, struct hrtimer *timer, ktime_t *now, unsigned long flags) __must_hold(&cpu_base->lock) { enum hrtimer_restart (*fn)(struct hrtimer *); bool expires_in_hardirq; int restart; lockdep_assert_held(&cpu_base->lock); debug_deactivate(timer); base->running = timer; /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE, 0); fn = timer->function; /* * Clear the 'is relative' flag for the TIME_LOW_RES case. If the * timer is restarted with a period then it becomes an absolute * timer. If its not restarted it does not matter. */ if (IS_ENABLED(CONFIG_TIME_LOW_RES)) timer->is_rel = false; /* * The timer is marked as running in the CPU base, so it is * protected against migration to a different CPU even if the lock * is dropped. */ raw_spin_unlock_irqrestore(&cpu_base->lock, flags); trace_hrtimer_expire_entry(timer, now); expires_in_hardirq = lockdep_hrtimer_enter(timer); restart = fn(timer); lockdep_hrtimer_exit(expires_in_hardirq); trace_hrtimer_expire_exit(timer); raw_spin_lock_irq(&cpu_base->lock); /* * Note: We clear the running state after enqueue_hrtimer and * we do not reprogram the event hardware. Happens either in * hrtimer_start_range_ns() or in hrtimer_interrupt() * * Note: Because we dropped the cpu_base->lock above, * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS); /* * Separate the ->running assignment from the ->state assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && * timer->state == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); WARN_ON_ONCE(base->running != timer); base->running = NULL; } static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, unsigned long flags, unsigned int active_mask) { struct hrtimer_clock_base *base; unsigned int active = cpu_base->active_bases & active_mask; for_each_active_base(base, cpu_base, active) { struct timerqueue_node *node; ktime_t basenow; basenow = ktime_add(now, base->offset); while ((node = timerqueue_getnext(&base->active))) { struct hrtimer *timer; timer = container_of(node, struct hrtimer, node); /* * The immediate goal for using the softexpires is * minimizing wakeups, not running timers at the * earliest interrupt after their soft expiration. * This allows us to avoid using a Priority Search * Tree, which can answer a stabbing querry for * overlapping intervals and instead use the simple * BST we already have. * We don't add extra wakeups by delaying timers that * are right-of a not yet expired timer, because that * timer will have to trigger a wakeup anyway. */ if (basenow < hrtimer_get_softexpires_tv64(timer)) break; __run_hrtimer(cpu_base, base, timer, &basenow, flags); if (active_mask == HRTIMER_ACTIVE_SOFT) hrtimer_sync_wait_running(cpu_base, flags); } } } static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; hrtimer_cpu_base_lock_expiry(cpu_base); raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); cpu_base->softirq_activated = 0; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); hrtimer_cpu_base_unlock_expiry(cpu_base); } #ifdef CONFIG_HIGH_RES_TIMERS /* * High resolution timer interrupt * Called with interrupts disabled */ void hrtimer_interrupt(struct clock_event_device *dev) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); ktime_t expires_next, now, entry_time, delta; unsigned long flags; int retries = 0; BUG_ON(!cpu_base->hres_active); cpu_base->nr_events++; dev->next_event = KTIME_MAX; raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: cpu_base->in_hrtirq = 1; /* * We set expires_next to KTIME_MAX here with cpu_base->lock * held to prevent that a timer is enqueued in our queue via * the migration code. This does not affect enqueueing of * timers which run their callback and need to be requeued on * this CPU. */ cpu_base->expires_next = KTIME_MAX; if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); /* Reevaluate the clock bases for the [soft] next expiry */ expires_next = hrtimer_update_next_event(cpu_base); /* * Store the new expiry value so the migration code can verify * against it. */ cpu_base->expires_next = expires_next; cpu_base->in_hrtirq = 0; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { cpu_base->hang_detected = 0; return; } /* * The next timer was already expired due to: * - tracing * - long lasting callbacks * - being scheduled away when running in a VM * * We need to prevent that we loop forever in the hrtimer * interrupt routine. We give it 3 attempts to avoid * overreacting on some spurious event. * * Acquire base lock for updating the offsets and retrieving * the current time. */ raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); cpu_base->nr_retries++; if (++retries < 3) goto retry; /* * Give the system a chance to do something else than looping * here. We stored the entry time, so we know exactly how long * we spent here. We schedule the next event this amount of * time away. */ cpu_base->nr_hangs++; cpu_base->hang_detected = 1; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); if ((unsigned int)delta > cpu_base->max_hang_time) cpu_base->max_hang_time = (unsigned int) delta; /* * Limit it to a sensible value as we enforce a longer * delay. Give the CPU at least 100ms to catch up. */ if (delta > 100 * NSEC_PER_MSEC) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); else expires_next = ktime_add(now, delta); tick_program_event(expires_next, 1); pr_warn_once("hrtimer: interrupt took %llu ns\n", ktime_to_ns(delta)); } /* called with interrupts disabled */ static inline void __hrtimer_peek_ahead_timers(void) { struct tick_device *td; if (!hrtimer_hres_active()) return; td = this_cpu_ptr(&tick_cpu_device); if (td && td->evtdev) hrtimer_interrupt(td->evtdev); } #else /* CONFIG_HIGH_RES_TIMERS */ static inline void __hrtimer_peek_ahead_timers(void) { } #endif /* !CONFIG_HIGH_RES_TIMERS */ /* * Called from run_local_timers in hardirq context every jiffy */ void hrtimer_run_queues(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); unsigned long flags; ktime_t now; if (__hrtimer_hres_active(cpu_base)) return; /* * This _is_ ugly: We have to check periodically, whether we * can switch to highres and / or nohz mode. The clocksource * switch happens with xtime_lock held. Notification from * there only sets the check bit in the tick_oneshot code, * otherwise we might deadlock vs. xtime_lock. */ if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) { hrtimer_switch_to_hres(); return; } raw_spin_lock_irqsave(&cpu_base->lock, flags); now = hrtimer_update_base(cpu_base); if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; cpu_base->softirq_activated = 1; raise_softirq_irqoff(HRTIMER_SOFTIRQ); } __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_HARD); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } /* * Sleep related functions: */ static enum hrtimer_restart hrtimer_wakeup(struct hrtimer *timer) { struct hrtimer_sleeper *t = container_of(timer, struct hrtimer_sleeper, timer); struct task_struct *task = t->task; t->task = NULL; if (task) wake_up_process(task); return HRTIMER_NORESTART; } /** * hrtimer_sleeper_start_expires - Start a hrtimer sleeper timer * @sl: sleeper to be started * @mode: timer mode abs/rel * * Wrapper around hrtimer_start_expires() for hrtimer_sleeper based timers * to allow PREEMPT_RT to tweak the delivery mode (soft/hardirq context) */ void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode) { /* * Make the enqueue delivery mode check work on RT. If the sleeper * was initialized for hard interrupt delivery, force the mode bit. * This is a special case for hrtimer_sleepers because * hrtimer_init_sleeper() determines the delivery mode on RT so the * fiddling with this decision is avoided at the call sites. */ if (IS_ENABLED(CONFIG_PREEMPT_RT) && sl->timer.is_hard) mode |= HRTIMER_MODE_HARD; hrtimer_start_expires(&sl->timer, mode); } EXPORT_SYMBOL_GPL(hrtimer_sleeper_start_expires); static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { /* * On PREEMPT_RT enabled kernels hrtimers which are not explicitely * marked for hard interrupt expiry mode are moved into soft * interrupt context either for latency reasons or because the * hrtimer callback takes regular spinlocks or invokes other * functions which are not suitable for hard interrupt context on * PREEMPT_RT. * * The hrtimer_sleeper callback is RT compatible in hard interrupt * context, but there is a latency concern: Untrusted userspace can * spawn many threads which arm timers for the same expiry time on * the same CPU. That causes a latency spike due to the wakeup of * a gazillion threads. * * OTOH, priviledged real-time user space applications rely on the * low latency of hard interrupt wakeups. If the current task is in * a real-time scheduling class, mark the mode for hard interrupt * expiry. */ if (IS_ENABLED(CONFIG_PREEMPT_RT)) { if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) mode |= HRTIMER_MODE_HARD; } __hrtimer_init(&sl->timer, clock_id, mode); sl->timer.function = hrtimer_wakeup; sl->task = current; } /** * hrtimer_init_sleeper - initialize sleeper to the given clock * @sl: sleeper to be initialized * @clock_id: the clock to be used * @mode: timer mode abs/rel */ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, clockid_t clock_id, enum hrtimer_mode mode) { debug_init(&sl->timer, clock_id, mode); __hrtimer_init_sleeper(sl, clock_id, mode); } EXPORT_SYMBOL_GPL(hrtimer_init_sleeper); int nanosleep_copyout(struct restart_block *restart, struct timespec64 *ts) { switch(restart->nanosleep.type) { #ifdef CONFIG_COMPAT_32BIT_TIME case TT_COMPAT: if (put_old_timespec32(ts, restart->nanosleep.compat_rmtp)) return -EFAULT; break; #endif case TT_NATIVE: if (put_timespec64(ts, restart->nanosleep.rmtp)) return -EFAULT; break; default: BUG(); } return -ERESTART_RESTARTBLOCK; } static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mode) { struct restart_block *restart; do { set_current_state(TASK_INTERRUPTIBLE); hrtimer_sleeper_start_expires(t, mode); if (likely(t->task)) freezable_schedule(); hrtimer_cancel(&t->timer); mode = HRTIMER_MODE_ABS; } while (t->task && !signal_pending(current)); __set_current_state(TASK_RUNNING); if (!t->task) return 0; restart = &current->restart_block; if (restart->nanosleep.type != TT_NONE) { ktime_t rem = hrtimer_expires_remaining(&t->timer); struct timespec64 rmt; if (rem <= 0) return 0; rmt = ktime_to_timespec64(rem); return nanosleep_copyout(restart, &rmt); } return -ERESTART_RESTARTBLOCK; } static long __sched hrtimer_nanosleep_restart(struct restart_block *restart) { struct hrtimer_sleeper t; int ret; hrtimer_init_sleeper_on_stack(&t, restart->nanosleep.clockid, HRTIMER_MODE_ABS); hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires); ret = do_nanosleep(&t, HRTIMER_MODE_ABS); destroy_hrtimer_on_stack(&t.timer); return ret; } long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, const clockid_t clockid) { struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) slack = 0; hrtimer_init_sleeper_on_stack(&t, clockid, mode); hrtimer_set_expires_range_ns(&t.timer, rqtp, slack); ret = do_nanosleep(&t, mode); if (ret != -ERESTART_RESTARTBLOCK) goto out; /* Absolute timers do not update the rmtp value and restart: */ if (mode == HRTIMER_MODE_ABS) { ret = -ERESTARTNOHAND; goto out; } restart = &current->restart_block; restart->nanosleep.clockid = t.timer.base->clockid; restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer); set_restart_fn(restart, hrtimer_nanosleep_restart); out: destroy_hrtimer_on_stack(&t.timer); return ret; } #ifdef CONFIG_64BIT SYSCALL_DEFINE2(nanosleep, struct __kernel_timespec __user *, rqtp, struct __kernel_timespec __user *, rmtp) { struct timespec64 tu; if (get_timespec64(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? TT_NATIVE : TT_NONE; current->restart_block.nanosleep.rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif #ifdef CONFIG_COMPAT_32BIT_TIME SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, struct old_timespec32 __user *, rmtp) { struct timespec64 tu; if (get_old_timespec32(&tu, rqtp)) return -EFAULT; if (!timespec64_valid(&tu)) return -EINVAL; current->restart_block.nanosleep.type = rmtp ? TT_COMPAT : TT_NONE; current->restart_block.nanosleep.compat_rmtp = rmtp; return hrtimer_nanosleep(timespec64_to_ktime(tu), HRTIMER_MODE_REL, CLOCK_MONOTONIC); } #endif /* * Functions related to boot-time initialization: */ int hrtimers_prepare_cpu(unsigned int cpu) { struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { struct hrtimer_clock_base *clock_b = &cpu_base->clock_base[i]; clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); timerqueue_init_head(&clock_b->active); } cpu_base->cpu = cpu; cpu_base->active_bases = 0; cpu_base->hres_active = 0; cpu_base->hang_detected = 0; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; hrtimer_cpu_base_init_expiry_lock(cpu_base); return 0; } #ifdef CONFIG_HOTPLUG_CPU static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { struct hrtimer *timer; struct timerqueue_node *node; while ((node = timerqueue_getnext(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_deactivate(timer); /* * Mark it as ENQUEUED not INACTIVE otherwise the * timer could be seen as !active and just vanish away * under us on another CPU */ __remove_hrtimer(timer, old_base, HRTIMER_STATE_ENQUEUED, 0); timer->base = new_base; /* * Enqueue the timers on the new cpu. This does not * reprogram the event device in case the timer * expires before the earliest on this CPU, but we run * hrtimer_interrupt after we migrated everything to * sort out already expired timers and reprogram the * event device. */ enqueue_hrtimer(timer, new_base, HRTIMER_MODE_ABS); } } int hrtimers_dead_cpu(unsigned int scpu) { struct hrtimer_cpu_base *old_base, *new_base; int i; BUG_ON(cpu_online(scpu)); tick_cancel_sched_timer(scpu); /* * this BH disable ensures that raise_softirq_irqoff() does * not wakeup ksoftirqd (and acquire the pi-lock) while * holding the cpu_base lock */ local_bh_disable(); local_irq_disable(); old_base = &per_cpu(hrtimer_bases, scpu); new_base = this_cpu_ptr(&hrtimer_bases); /* * The caller is globally serialized and nobody else * takes two locks at once, deadlock is not possible. */ raw_spin_lock(&new_base->lock); raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { migrate_hrtimer_list(&old_base->clock_base[i], &new_base->clock_base[i]); } /* * The migration might have changed the first expiring softirq * timer on this CPU. Update it. */ hrtimer_update_softirq_timer(new_base, false); raw_spin_unlock(&old_base->lock); raw_spin_unlock(&new_base->lock); /* Check, if we got expired work to do */ __hrtimer_peek_ahead_timers(); local_irq_enable(); local_bh_enable(); return 0; } #endif /* CONFIG_HOTPLUG_CPU */ void __init hrtimers_init(void) { hrtimers_prepare_cpu(smp_processor_id()); open_softirq(HRTIMER_SOFTIRQ, hrtimer_run_softirq); } /** * schedule_hrtimeout_range_clock - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * @clock_id: timer clock to be used */ int __sched schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, clockid_t clock_id) { struct hrtimer_sleeper t; /* * Optimize when a zero timeout value is given. It does not * matter whether this is an absolute or a relative time. */ if (expires && *expires == 0) { __set_current_state(TASK_RUNNING); return 0; } /* * A NULL parameter means "infinite" */ if (!expires) { schedule(); return -EINTR; } hrtimer_init_sleeper_on_stack(&t, clock_id, mode); hrtimer_set_expires_range_ns(&t.timer, *expires, delta); hrtimer_sleeper_start_expires(&t, mode); if (likely(t.task)) schedule(); hrtimer_cancel(&t.timer); destroy_hrtimer_on_stack(&t.timer); __set_current_state(TASK_RUNNING); return !t.task ? 0 : -EINTR; } /** * schedule_hrtimeout_range - sleep until timeout * @expires: timeout value (ktime_t) * @delta: slack in expires timeout (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * The @delta argument gives the kernel the freedom to schedule the * actual wakeup to a time that is both power and performance friendly. * The kernel give the normal best effort behavior for "@expires+@delta", * but may decide to fire the timer earlier, but no earlier than @expires. * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, CLOCK_MONOTONIC); } EXPORT_SYMBOL_GPL(schedule_hrtimeout_range); /** * schedule_hrtimeout - sleep until timeout * @expires: timeout value (ktime_t) * @mode: timer mode * * Make the current task sleep until the given expiry time has * elapsed. The routine will return immediately unless * the current task state has been set (see set_current_state()). * * You can set the task state as follows - * * %TASK_UNINTERRUPTIBLE - at least @timeout time is guaranteed to * pass before the routine returns unless the current task is explicitly * woken up, (e.g. by wake_up_process()). * * %TASK_INTERRUPTIBLE - the routine may return early if a signal is * delivered to the current task or the current task is explicitly woken * up. * * The current task state is guaranteed to be TASK_RUNNING when this * routine returns. * * Returns 0 when the timer has expired. If the task was woken before the * timer expired by a signal (only possible in state TASK_INTERRUPTIBLE) or * by an explicit wakeup, it returns -EINTR. */ int __sched schedule_hrtimeout(ktime_t *expires, const enum hrtimer_mode mode) { return schedule_hrtimeout_range(expires, 0, mode); } EXPORT_SYMBOL_GPL(schedule_hrtimeout);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_NET_SCM_H #define __LINUX_NET_SCM_H #include <linux/limits.h> #include <linux/net.h> #include <linux/cred.h> #include <linux/security.h> #include <linux/pid.h> #include <linux/nsproxy.h> #include <linux/sched/signal.h> /* Well, we should have at least one descriptor open * to accept passed FDs 8) */ #define SCM_MAX_FD 253 struct scm_creds { u32 pid; kuid_t uid; kgid_t gid; }; struct scm_fp_list { short count; short max; struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; struct scm_cookie { struct pid *pid; /* Skb credentials */ struct scm_fp_list *fp; /* Passed files */ struct scm_creds creds; /* Skb credentials */ #ifdef CONFIG_SECURITY_NETWORK u32 secid; /* Passed security ID */ #endif }; void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm); void scm_detach_fds_compat(struct msghdr *msg, struct scm_cookie *scm); int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm); void __scm_destroy(struct scm_cookie *scm); struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl); #ifdef CONFIG_SECURITY_NETWORK static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { security_socket_getpeersec_dgram(sock, NULL, &scm->secid); } #else static __inline__ void unix_get_peersec_dgram(struct socket *sock, struct scm_cookie *scm) { } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_set_cred(struct scm_cookie *scm, struct pid *pid, kuid_t uid, kgid_t gid) { scm->pid = get_pid(pid); scm->creds.pid = pid_vnr(pid); scm->creds.uid = uid; scm->creds.gid = gid; } static __inline__ void scm_destroy_cred(struct scm_cookie *scm) { put_pid(scm->pid); scm->pid = NULL; } static __inline__ void scm_destroy(struct scm_cookie *scm) { scm_destroy_cred(scm); if (scm->fp) __scm_destroy(scm); } static __inline__ int scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, bool forcecreds) { memset(scm, 0, sizeof(*scm)); scm->creds.uid = INVALID_UID; scm->creds.gid = INVALID_GID; if (forcecreds) scm_set_cred(scm, task_tgid(current), current_uid(), current_gid()); unix_get_peersec_dgram(sock, scm); if (msg->msg_controllen <= 0) return 0; return __scm_send(sock, msg, scm); } #ifdef CONFIG_SECURITY_NETWORK static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { char *secdata; u32 seclen; int err; if (test_bit(SOCK_PASSSEC, &sock->flags)) { err = security_secid_to_secctx(scm->secid, &secdata, &seclen); if (!err) { put_cmsg(msg, SOL_SOCKET, SCM_SECURITY, seclen, secdata); security_release_secctx(secdata, seclen); } } } #else static inline void scm_passec(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm) { } #endif /* CONFIG_SECURITY_NETWORK */ static __inline__ void scm_recv(struct socket *sock, struct msghdr *msg, struct scm_cookie *scm, int flags) { if (!msg->msg_control) { if (test_bit(SOCK_PASSCRED, &sock->flags) || scm->fp) msg->msg_flags |= MSG_CTRUNC; scm_destroy(scm); return; } if (test_bit(SOCK_PASSCRED, &sock->flags)) { struct user_namespace *current_ns = current_user_ns(); struct ucred ucreds = { .pid = scm->creds.pid, .uid = from_kuid_munged(current_ns, scm->creds.uid), .gid = from_kgid_munged(current_ns, scm->creds.gid), }; put_cmsg(msg, SOL_SOCKET, SCM_CREDENTIALS, sizeof(ucreds), &ucreds); } scm_destroy_cred(scm); scm_passec(sock, msg, scm); if (!scm->fp) return; scm_detach_fds(msg, scm); } #endif /* __LINUX_NET_SCM_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 /* SPDX-License-Identifier: GPL-2.0 */ /* * Generic nexthop implementation * * Copyright (c) 2017-19 Cumulus Networks * Copyright (c) 2017-19 David Ahern <dsa@cumulusnetworks.com> */ #ifndef __LINUX_NEXTHOP_H #define __LINUX_NEXTHOP_H #include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/route.h> #include <linux/types.h> #include <net/ip_fib.h> #include <net/ip6_fib.h> #include <net/netlink.h> #define NEXTHOP_VALID_USER_FLAGS RTNH_F_ONLINK struct nexthop; struct nh_config { u32 nh_id; u8 nh_family; u8 nh_protocol; u8 nh_blackhole; u8 nh_fdb; u32 nh_flags; int nh_ifindex; struct net_device *dev; union { __be32 ipv4; struct in6_addr ipv6; } gw; struct nlattr *nh_grp; u16 nh_grp_type; struct nlattr *nh_encap; u16 nh_encap_type; u32 nlflags; struct nl_info nlinfo; }; struct nh_info { struct hlist_node dev_hash; /* entry on netns devhash */ struct nexthop *nh_parent; u8 family; bool reject_nh; bool fdb_nh; union { struct fib_nh_common fib_nhc; struct fib_nh fib_nh; struct fib6_nh fib6_nh; }; }; struct nh_grp_entry { struct nexthop *nh; u8 weight; atomic_t upper_bound; struct list_head nh_list; struct nexthop *nh_parent; /* nexthop of group with this entry */ }; struct nh_group { struct nh_group *spare; /* spare group for removals */ u16 num_nh; bool mpath; bool fdb_nh; bool has_v4; struct nh_grp_entry nh_entries[]; }; struct nexthop { struct rb_node rb_node; /* entry on netns rbtree */ struct list_head fi_list; /* v4 entries using nh */ struct list_head f6i_list; /* v6 entries using nh */ struct list_head fdb_list; /* fdb entries using this nh */ struct list_head grp_list; /* nh group entries using this nh */ struct net *net; u32 id; u8 protocol; /* app managing this nh */ u8 nh_flags; bool is_group; refcount_t refcnt; struct rcu_head rcu; union { struct nh_info __rcu *nh_info; struct nh_group __rcu *nh_grp; }; }; enum nexthop_event_type { NEXTHOP_EVENT_DEL }; int register_nexthop_notifier(struct net *net, struct notifier_block *nb); int unregister_nexthop_notifier(struct net *net, struct notifier_block *nb); /* caller is holding rcu or rtnl; no reference taken to nexthop */ struct nexthop *nexthop_find_by_id(struct net *net, u32 id); void nexthop_free_rcu(struct rcu_head *head); static inline bool nexthop_get(struct nexthop *nh) { return refcount_inc_not_zero(&nh->refcnt); } static inline void nexthop_put(struct nexthop *nh) { if (refcount_dec_and_test(&nh->refcnt)) call_rcu(&nh->rcu, nexthop_free_rcu); } static inline bool nexthop_cmp(const struct nexthop *nh1, const struct nexthop *nh2) { return nh1 == nh2; } static inline bool nexthop_is_fdb(const struct nexthop *nh) { if (nh->is_group) { const struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->fdb_nh; } else { const struct nh_info *nhi; nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->fdb_nh; } } static inline bool nexthop_has_v4(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->has_v4; } return false; } static inline bool nexthop_is_multipath(const struct nexthop *nh) { if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); return nh_grp->mpath; } return false; } struct nexthop *nexthop_select_path(struct nexthop *nh, int hash); static inline unsigned int nexthop_num_path(const struct nexthop *nh) { unsigned int rc = 1; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->mpath) rc = nh_grp->num_nh; } return rc; } static inline struct nexthop *nexthop_mpath_select(const struct nh_group *nhg, int nhsel) { /* for_nexthops macros in fib_semantics.c grabs a pointer to * the nexthop before checking nhsel */ if (nhsel >= nhg->num_nh) return NULL; return nhg->nh_entries[nhsel].nh; } static inline int nexthop_mpath_fill_node(struct sk_buff *skb, struct nexthop *nh, u8 rt_family) { struct nh_group *nhg = rtnl_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; struct nh_info *nhi = rcu_dereference_rtnl(nhe->nh_info); struct fib_nh_common *nhc = &nhi->fib_nhc; int weight = nhg->nh_entries[i].weight; if (fib_add_nexthop(skb, nhc, weight, rt_family, 0) < 0) return -EMSGSIZE; } return 0; } /* called with rcu lock */ static inline bool nexthop_is_blackhole(const struct nexthop *nh) { const struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->num_nh > 1) return false; nh = nh_grp->nh_entries[0].nh; } nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->reject_nh; } static inline void nexthop_path_fib_result(struct fib_result *res, int hash) { struct nh_info *nhi; struct nexthop *nh; nh = nexthop_select_path(res->fi->nh, hash); nhi = rcu_dereference(nh->nh_info); res->nhc = &nhi->fib_nhc; } /* called with rcu read lock or rtnl held */ static inline struct fib_nh_common *nexthop_fib_nhc(struct nexthop *nh, int nhsel) { struct nh_info *nhi; BUILD_BUG_ON(offsetof(struct fib_nh, nh_common) != 0); BUILD_BUG_ON(offsetof(struct fib6_nh, nh_common) != 0); if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); if (nh_grp->mpath) { nh = nexthop_mpath_select(nh_grp, nhsel); if (!nh) return NULL; } } nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } /* called from fib_table_lookup with rcu_lock */ static inline struct fib_nh_common *nexthop_get_nhc_lookup(const struct nexthop *nh, int fib_flags, const struct flowi4 *flp, int *nhsel) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = i; return &nhi->fib_nhc; } } } else { nhi = rcu_dereference(nh->nh_info); if (fib_lookup_good_nhc(&nhi->fib_nhc, fib_flags, flp)) { *nhsel = 0; return &nhi->fib_nhc; } } return NULL; } static inline bool nexthop_uses_dev(const struct nexthop *nh, const struct net_device *dev) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nhg = rcu_dereference(nh->nh_grp); int i; for (i = 0; i < nhg->num_nh; i++) { struct nexthop *nhe = nhg->nh_entries[i].nh; nhi = rcu_dereference(nhe->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } } else { nhi = rcu_dereference(nh->nh_info); if (nhc_l3mdev_matches_dev(&nhi->fib_nhc, dev)) return true; } return false; } static inline unsigned int fib_info_num_path(const struct fib_info *fi) { if (unlikely(fi->nh)) return nexthop_num_path(fi->nh); return fi->fib_nhs; } int fib_check_nexthop(struct nexthop *nh, u8 scope, struct netlink_ext_ack *extack); static inline struct fib_nh_common *fib_info_nhc(struct fib_info *fi, int nhsel) { if (unlikely(fi->nh)) return nexthop_fib_nhc(fi->nh, nhsel); return &fi->fib_nh[nhsel].nh_common; } /* only used when fib_nh is built into fib_info */ static inline struct fib_nh *fib_info_nh(struct fib_info *fi, int nhsel) { WARN_ON(fi->nh); return &fi->fib_nh[nhsel]; } /* * IPv6 variants */ int fib6_check_nexthop(struct nexthop *nh, struct fib6_config *cfg, struct netlink_ext_ack *extack); /* Caller should either hold rcu_read_lock(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh(struct nexthop *nh) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_rtnl(nh->nh_grp); nh = nexthop_mpath_select(nh_grp, 0); if (!nh) return NULL; } nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->family == AF_INET6) return &nhi->fib6_nh; return NULL; } /* Variant of nexthop_fib6_nh(). * Caller should either hold rcu_read_lock_bh(), or RTNL. */ static inline struct fib6_nh *nexthop_fib6_nh_bh(struct nexthop *nh) { struct nh_info *nhi; if (nh->is_group) { struct nh_group *nh_grp; nh_grp = rcu_dereference_bh_rtnl(nh->nh_grp); nh = nexthop_mpath_select(nh_grp, 0); if (!nh) return NULL; } nhi = rcu_dereference_bh_rtnl(nh->nh_info); if (nhi->family == AF_INET6) return &nhi->fib6_nh; return NULL; } static inline struct net_device *fib6_info_nh_dev(struct fib6_info *f6i) { struct fib6_nh *fib6_nh; fib6_nh = f6i->nh ? nexthop_fib6_nh(f6i->nh) : f6i->fib6_nh; return fib6_nh->fib_nh_dev; } static inline void nexthop_path_fib6_result(struct fib6_result *res, int hash) { struct nexthop *nh = res->f6i->nh; struct nh_info *nhi; nh = nexthop_select_path(nh, hash); nhi = rcu_dereference_rtnl(nh->nh_info); if (nhi->reject_nh) { res->fib6_type = RTN_BLACKHOLE; res->fib6_flags |= RTF_REJECT; res->nh = nexthop_fib6_nh(nh); } else { res->nh = &nhi->fib6_nh; } } int nexthop_for_each_fib6_nh(struct nexthop *nh, int (*cb)(struct fib6_nh *nh, void *arg), void *arg); static inline int nexthop_get_family(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return nhi->family; } static inline struct fib_nh_common *nexthop_fdb_nhc(struct nexthop *nh) { struct nh_info *nhi = rcu_dereference_rtnl(nh->nh_info); return &nhi->fib_nhc; } static inline struct fib_nh_common *nexthop_path_fdb_result(struct nexthop *nh, int hash) { struct nh_info *nhi; struct nexthop *nhp; nhp = nexthop_select_path(nh, hash); if (unlikely(!nhp)) return NULL; nhi = rcu_dereference(nhp->nh_info); return &nhi->fib_nhc; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0+ */ /* * RCU-based infrastructure for lightweight reader-writer locking * * Copyright (c) 2015, Red Hat, Inc. * * Author: Oleg Nesterov <oleg@redhat.com> */ #ifndef _LINUX_RCU_SYNC_H_ #define _LINUX_RCU_SYNC_H_ #include <linux/wait.h> #include <linux/rcupdate.h> /* Structure to mediate between updaters and fastpath-using readers. */ struct rcu_sync { int gp_state; int gp_count; wait_queue_head_t gp_wait; struct rcu_head cb_head; }; /** * rcu_sync_is_idle() - Are readers permitted to use their fastpaths? * @rsp: Pointer to rcu_sync structure to use for synchronization * * Returns true if readers are permitted to use their fastpaths. Must be * invoked within some flavor of RCU read-side critical section. */ static inline bool rcu_sync_is_idle(struct rcu_sync *rsp) { RCU_LOCKDEP_WARN(!rcu_read_lock_any_held(), "suspicious rcu_sync_is_idle() usage"); return !READ_ONCE(rsp->gp_state); /* GP_IDLE */ } extern void rcu_sync_init(struct rcu_sync *); extern void rcu_sync_enter_start(struct rcu_sync *); extern void rcu_sync_enter(struct rcu_sync *); extern void rcu_sync_exit(struct rcu_sync *); extern void rcu_sync_dtor(struct rcu_sync *); #define __RCU_SYNC_INITIALIZER(name) { \ .gp_state = 0, \ .gp_count = 0, \ .gp_wait = __WAIT_QUEUE_HEAD_INITIALIZER(name.gp_wait), \ } #define DEFINE_RCU_SYNC(name) \ struct rcu_sync name = __RCU_SYNC_INITIALIZER(name) #endif /* _LINUX_RCU_SYNC_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 /* SPDX-License-Identifier: GPL-2.0 */ /* * memory buffer pool support */ #ifndef _LINUX_MEMPOOL_H #define _LINUX_MEMPOOL_H #include <linux/wait.h> #include <linux/compiler.h> struct kmem_cache; typedef void * (mempool_alloc_t)(gfp_t gfp_mask, void *pool_data); typedef void (mempool_free_t)(void *element, void *pool_data); typedef struct mempool_s { spinlock_t lock; int min_nr; /* nr of elements at *elements */ int curr_nr; /* Current nr of elements at *elements */ void **elements; void *pool_data; mempool_alloc_t *alloc; mempool_free_t *free; wait_queue_head_t wait; } mempool_t; static inline bool mempool_initialized(mempool_t *pool) { return pool->elements != NULL; } void mempool_exit(mempool_t *pool); int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int node_id); int mempool_init(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); extern mempool_t *mempool_create(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data); extern mempool_t *mempool_create_node(int min_nr, mempool_alloc_t *alloc_fn, mempool_free_t *free_fn, void *pool_data, gfp_t gfp_mask, int nid); extern int mempool_resize(mempool_t *pool, int new_min_nr); extern void mempool_destroy(mempool_t *pool); extern void *mempool_alloc(mempool_t *pool, gfp_t gfp_mask) __malloc; extern void mempool_free(void *element, mempool_t *pool); /* * A mempool_alloc_t and mempool_free_t that get the memory from * a slab cache that is passed in through pool_data. * Note: the slab cache may not have a ctor function. */ void *mempool_alloc_slab(gfp_t gfp_mask, void *pool_data); void mempool_free_slab(void *element, void *pool_data); static inline int mempool_init_slab_pool(mempool_t *pool, int min_nr, struct kmem_cache *kc) { return mempool_init(pool, min_nr, mempool_alloc_slab, mempool_free_slab, (void *) kc); } static inline mempool_t * mempool_create_slab_pool(int min_nr, struct kmem_cache *kc) { return mempool_create(min_nr, mempool_alloc_slab, mempool_free_slab, (void *) kc); } /* * a mempool_alloc_t and a mempool_free_t to kmalloc and kfree the * amount of memory specified by pool_data */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data); void mempool_kfree(void *element, void *pool_data); static inline int mempool_init_kmalloc_pool(mempool_t *pool, int min_nr, size_t size) { return mempool_init(pool, min_nr, mempool_kmalloc, mempool_kfree, (void *) size); } static inline mempool_t *mempool_create_kmalloc_pool(int min_nr, size_t size) { return mempool_create(min_nr, mempool_kmalloc, mempool_kfree, (void *) size); } /* * A mempool_alloc_t and mempool_free_t for a simple page allocator that * allocates pages of the order specified by pool_data */ void *mempool_alloc_pages(gfp_t gfp_mask, void *pool_data); void mempool_free_pages(void *element, void *pool_data); static inline int mempool_init_page_pool(mempool_t *pool, int min_nr, int order) { return mempool_init(pool, min_nr, mempool_alloc_pages, mempool_free_pages, (void *)(long)order); } static inline mempool_t *mempool_create_page_pool(int min_nr, int order) { return mempool_create(min_nr, mempool_alloc_pages, mempool_free_pages, (void *)(long)order); } #endif /* _LINUX_MEMPOOL_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP module. * * Version: @(#)tcp.h 1.0.5 05/23/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _TCP_H #define _TCP_H #define FASTRETRANS_DEBUG 1 #include <linux/list.h> #include <linux/tcp.h> #include <linux/bug.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> #include <net/sock_reuseport.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/bpf-cgroup.h> #include <linux/siphash.h> extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 #define TCP_MIN_SND_MSS 48 #define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* * Never offer a window over 32767 without using window scaling. Some * poor stacks do signed 16bit maths! */ #define MAX_TCP_WINDOW 32767U /* Minimal accepted MSS. It is (60+60+8) - (20+20). */ #define TCP_MIN_MSS 88U /* The initial MTU to use for probing */ #define TCP_BASE_MSS 1024 /* probing interval, default to 10 minutes as per RFC4821 */ #define TCP_PROBE_INTERVAL 600 /* Specify interval when tcp mtu probing will stop */ #define TCP_PROBE_THRESHOLD 8 /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 /* Maximal number of ACKs sent quickly to accelerate slow-start. */ #define TCP_MAX_QUICKACKS 16U /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 #define TCP_URG_READ 0x0400 #define TCP_RETR1 3 /* * This is how many retries it does before it * tries to figure out if the gateway is * down. Minimal RFC value is 3; it corresponds * to ~3sec-8min depending on RTO. */ #define TCP_RETR2 15 /* * This should take at least * 90 minutes to time out. * RFC1122 says that the limit is 100 sec. * 15 is ~13-30min depending on RTO. */ #define TCP_SYN_RETRIES 6 /* This is how many retries are done * when active opening a connection. * RFC1122 says the minimum retry MUST * be at least 180secs. Nevertheless * this value is corresponding to * 63secs of retransmission with the * current initial RTO. */ #define TCP_SYNACK_RETRIES 5 /* This is how may retries are done * when passive opening a connection. * This is corresponding to 31secs of * retransmission with the current * initial RTO. */ #define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN /* BSD style FIN_WAIT2 deadlock breaker. * It used to be 3min, new value is 60sec, * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) #else #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the * initial data transmission if no * valid RTT sample has been acquired, * most likely due to retrans in 3WHS. */ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) #define MAX_TCP_KEEPIDLE 32767 #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN * to provide reliability equal to one * provided by timewait state. */ #define TCP_PAWS_WINDOW 1 /* Replay window for per-host * timestamps. It must be less than * minimal timewait lifetime. */ /* * TCP option */ #define TCPOPT_NOP 1 /* Padding */ #define TCPOPT_EOL 0 /* End of options */ #define TCPOPT_MSS 2 /* Segment size negotiating */ #define TCPOPT_WINDOW 3 /* Window scaling */ #define TCPOPT_SACK_PERM 4 /* SACK Permitted */ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths */ #define TCPOLEN_MSS 4 #define TCPOLEN_WINDOW 3 #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 #define TCPOLEN_WSCALE_ALIGNED 4 #define TCPOLEN_SACKPERM_ALIGNED 4 #define TCPOLEN_SACK_BASE 2 #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ /* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 /* Force enable TFO on all listeners, i.e., not requiring the * TCP_FASTOPEN socket option. */ #define TFO_SERVER_WO_SOCKOPT1 0x400 /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ static inline bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; } #define after(seq2, seq1) before(seq1, seq2) /* is s2<=s1<=s3 ? */ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) { return seq3 - seq2 >= seq1 - seq2; } static inline bool tcp_out_of_memory(struct sock *sk) { if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) return true; return false; } void sk_forced_mem_schedule(struct sock *sk, int size); bool tcp_check_oom(struct sock *sk, int shift); extern struct proto tcp_prot; #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) void tcp_tasklet_init(void); int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ icsk->icsk_ack.ato = TCP_ATO_MIN; } else icsk->icsk_ack.quick -= pkts; } } #define TCP_ECN_OK 1 #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, TCP_TW_SYN = 3 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* * BPF SKB-less helpers */ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, TCP_SYNACK_FASTOPEN, TCP_SYNACK_COOKIE, }; struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, u32 tsoff); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. * This counter is used both as a hash input and partially encoded into * the cookie value. A cookie is only validated further if the delta * between the current counter value and the encoded one is less than this, * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if * the counter advances immediately after a cookie is generated). */ #define MAX_SYNCOOKIE_AGE 2 #define TCP_SYNCOOKIE_PERIOD (60 * HZ) #define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD) /* syncookies: remember time of last synqueue overflow * But do not dirty this field too often (once per second is enough) * It is racy as we do not hold a lock, but race is very minor. */ static inline void tcp_synq_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use * 'last_overflow - HZ' as lower bound. That's because a concurrent * tcp_synq_overflow() could update .ts_recent_stamp after we read * jiffies but before we store .ts_recent_stamp into last_overflow, * which could lead to rejecting a valid syncookie. */ return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) { u64 val = get_jiffies_64(); do_div(val, TCP_SYNCOOKIE_PERIOD); return val; } u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, const struct net *net, const struct dst_entry *dst); /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE, }; int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) __sock_put(sk); if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) __sock_put(sk); inet_csk_clear_xmit_timers(sk); } unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { int cutoff; /* When peer uses tiny windows, there is no use in packetizing * to sub-MSS pieces for the sake of SWS or making sure there * are enough packets in the pipe for fast recovery. * * On the other hand, for extremely large MSS devices, handling * smaller than MSS windows in this way does make sense. */ if (tp->max_window > TCP_MSS_DEFAULT) cutoff = (tp->max_window >> 1); else cutoff = tp->max_window; if (cutoff && pktsize > cutoff) return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) tcp_fast_path_on(tp); } /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } static inline u32 tcp_rto_min_us(struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) { return dst_metric_locked(dst, RTAX_CC_ALGO); } /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; } /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. */ u32 __tcp_select_window(struct sock *sk); void tcp_send_window_probe(struct sock *sk); /* TCP uses 32bit jiffies to save some space. * Note that this is different from tcp_time_stamp, which * historically has been the same until linux-4.13. */ #define tcp_jiffies32 ((u32)jiffies) /* * Deliver a 32bit value for TCP timestamp option (RFC 7323) * It is no longer tied to jiffies, but to 1 ms clock. * Note: double check if you want to use tcp_jiffies32 instead of this. */ #define TCP_TS_HZ 1000 static inline u64 tcp_clock_ns(void) { return ktime_get_ns(); } static inline u64 tcp_clock_us(void) { return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } /* This should only be used in contexts where tp->tcp_mstamp is up to date */ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) { return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ static inline u32 tcp_ns_to_ts(u64 ns) { return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); } /* Could use tcp_clock_us() / 1000, but this version uses a single divide */ static inline u32 tcp_time_stamp_raw(void) { return tcp_ns_to_ts(tcp_clock_ns()); } void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) { return max_t(s64, t1 - t0, 0); } static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); } /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN 0x01 #define TCPHDR_SYN 0x02 #define TCPHDR_RST 0x04 #define TCPHDR_PSH 0x08 #define TCPHDR_ACK 0x10 #define TCPHDR_URG 0x20 #define TCPHDR_ECE 0x40 #define TCPHDR_CWR 0x80 #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : tcp_tw_isn is used in input path only * (isn chosen by tcp_timewait_state_process()) * * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ #define TCPCB_TAGBITS 0x07 /* All tag bits */ #define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ TCPCB_REPAIRED) __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { /* There is space for up to 24 bytes */ __u32 in_flight:30,/* Bytes in flight at transmit */ is_app_limited:1, /* cwnd not fully used? */ unused:1; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ u64 first_tx_mstamp; /* when we reached the "delivered" count */ u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ struct { __u32 flags; struct sock *sk_redir; void *data_end; } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) { TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); } static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; } static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.sk_redir; } static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) { TCP_SKB_CB(skb)->bpf.sk_redir = NULL; } extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->header.h6.iif; } static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) return TCP_SKB_CB(skb)->header.h6.iif; #endif return 0; } extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb)); #endif /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return TCP_SKB_CB(skb)->header.h4.iif; #endif return 0; } /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_segs; } static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs = segs; } static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */ static inline int tcp_skb_mss(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_size; } static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) { return likely(!TCP_SKB_CB(skb)->eor); } static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from)); } /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* * Interface for adding new TCP congestion control handlers */ #define TCP_CA_NAME_MAX 16 #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; struct ack_sample { u32 pkts_acked; s32 rtt_us; u32 in_flight; }; /* A rate sample measures the number of (original/retransmitted) data * packets delivered "delivered" over an interval of time "interval_us". * The tcp_rate.c code fills in the rate sample, and congestion * control modules that define a cong_control function to run at the end * of ACK processing can optionally chose to consult this sample when * setting cwnd and pacing rate. * A sample is invalid if "delivered" or "interval_us" is negative. */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { struct list_head list; u32 key; u32 flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); /* cleanup private data (optional) */ void (*release)(struct sock *sk); /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); /* new value of cwnd after loss (required) */ u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; }; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(struct net *net, const char *name); void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) { return NULL; } #endif static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cwnd_event) icsk->icsk_ca_ops->cwnd_event(sk, event); } /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK */ static inline int tcp_is_sack(const struct tcp_sock *tp) { return likely(tp->rx_opt.sack_ok); } static inline bool tcp_is_reno(const struct tcp_sock *tp) { return !tcp_is_sack(tp); } static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK * blocks etc.) we can make more aggressive calculations. * * Use this for decisions involving congestion control, use just * tp->packets_out to determine if the send queue is empty or not. * * Read this equation as: * * "Packets sent once on transmission queue" MINUS * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } #define TCP_INFINITE_SSTHRESH 0x7fffffff static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tp->snd_cwnd < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; } static inline bool tcp_in_cwnd_reduction(const struct sock *sk) { return (TCPF_CA_CWR | TCPF_CA_Recovery) & (1 << inet_csk(sk)->icsk_ca_state); } /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is cwnd reduction phase, when cwnd is decreasing towards * ssthresh. */ static inline __u32 tcp_current_ssthresh(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_in_cwnd_reduction(sk)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, ((tp->snd_cwnd >> 1) + (tp->snd_cwnd >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) void tcp_enter_cwr(struct sock *sk); __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst); /* The maximum number of MSS of available cwnd for which TSO defers * sending if not using sysctl_tcp_tso_win_divisor. */ static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp) { return 3; } /* Returns end sequence number of the receiver's advertised window */ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless * it was fully used previously. And that's exactly what we do in * congestion avoidance mode. But in slow start we allow cwnd to grow * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. * This check is safe because it's as aggressive as slow start which already * risks 100% overshoot. The advantage is that we discourage application to * either send more filler packets or data to artificially blow up the cwnd * usage, and allow application-limited process to probe bw more aggressively. */ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tp->snd_cwnd < 2 * tp->max_packets_out; return tp->is_cwnd_limited; } /* BBR congestion control needs pacing. * Same remark for SO_MAX_PACING_RATE. * sch_fq packet scheduler is efficiently handling pacing, * but is not always installed/used. * Return true if TCP stack should pace packets itself. */ static inline bool tcp_needs_internal_pacing(const struct sock *sk) { return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } /* Estimates in how many jiffies next packet for this flow can be sent. * Scheduling a retransmit timer too early would be silly. */ static inline unsigned long tcp_pacing_delay(const struct sock *sk) { s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when) { inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), max_when); } /* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window, or we are paced. * We do not want to add fuel to the fire, or abort too early, * so make sure the timer we arm now is at least 200ms in the future, * regardless of current icsk_rto value (as it could be ~2ms) */ static inline unsigned long tcp_probe0_base(const struct sock *sk) { return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); } /* Variant of inet_csk_rto_backoff() used for zero window probes */ static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff; return (unsigned long)min_t(u64, when, max_when); } static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } /* * Calculate(/check) TCP checksum */ static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __skb_checksum_complete(skb); } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->num_sacks = 0; } void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); s32 delta; if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); static inline int tcp_win_from_space(const struct sock *sk, int space) { int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; return tcp_adv_win_scale <= 0 ? (space>>(-tcp_adv_win_scale)) : space - (space>>tcp_adv_win_scale); } /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } void tcp_cleanup_rbuf(struct sock *sk, int copied); /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override * SO_RCVLOWAT constraint, since we are receiving skbs with too small * len/truesize ratio. */ static inline bool tcp_rmem_pressure(const struct sock *sk) { int rcvbuf, threshold; if (tcp_under_memory_pressure(sk)) return true; rcvbuf = READ_ONCE(sk->sk_rcvbuf); threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl; } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes; } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) { int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, int paws_win) { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, * then following tcp messages have valid values. Ignore 0 value, * or else 'negative' tsval might forbid us to accept their packets. */ if (!rx_opt->ts_recent) return true; return false; } static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, int rst) { if (tcp_paws_check(rx_opt, 0)) return false; /* RST segments are not recommended to carry timestamp, and, if they do, it is recommended to ignore PAWS because "their cleanup function should take precedence over timestamps." Certainly, it is mistake. It is necessary to understand the reasons of this constraint to relax it: if peer reboots, clock may go out-of-sync and half-open connections will not be reset. Actually, the problem would be not existing if all the implementations followed draft about maintaining clock via reboots. Linux-2.2 DOES NOT! However, we can relax time bounds for RST segments to MSL. */ if (rst && !time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1); TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1); } /* from STCP */ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; } static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } union tcp_md5_addr { struct in_addr a4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr a6; #endif }; /* - key database */ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; /* - sock block */ struct tcp_md5sig_info { struct hlist_head head; struct rcu_head rcu; }; /* - pseudo header */ struct tcp4_pseudohdr { __be32 saddr; __be32 daddr; __u8 pad; __u8 protocol; __be16 len; }; struct tcp6_pseudohdr { struct in6_addr saddr; struct in6_addr daddr; __be32 len; __be32 protocol; /* including padding */ }; union tcp_md5sum_block { struct tcp4_pseudohdr ip4; #if IS_ENABLED(CONFIG_IPV6) struct tcp6_pseudohdr ip6; #endif }; /* - pool: digest algorithm, hash description and scratch buffer */ struct tcp_md5sig_pool { struct ahash_request *md5_req; void *scratch; }; /* - functions */ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG #include <linux/jump_label.h> extern struct static_key_false tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { return NULL; } #define tcp_twsk_md5_key(twsk) NULL #endif bool tcp_alloc_md5sig_pool(void); struct tcp_md5sig_pool *tcp_get_md5sig_pool(void); static inline void tcp_put_md5sig_pool(void) { local_bh_enable(); } int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *, unsigned int header_len); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); struct tcp_fastopen_request { /* Fast Open cookie. Size 0 means a cookie request */ struct tcp_fastopen_cookie cookie; struct msghdr *data; /* data in MSG_FASTOPEN */ size_t size; int copied; /* queued in tcp_connect() */ struct ubuf_info *uarg; }; void tcp_free_fastopen_req(struct tcp_sock *tp); void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t) #define TCP_FASTOPEN_KEY_MAX 2 #define TCP_FASTOPEN_KEY_BUF_LENGTH \ (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX) /* Fastopen key context */ struct tcp_fastopen_context { siphash_key_t key[TCP_FASTOPEN_KEY_MAX]; int num; struct rcu_head rcu; }; extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Caller needs to wrap with rcu_read_(un)lock() */ static inline struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); if (!ctx) ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); return ctx; } static inline bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc, const struct tcp_fastopen_cookie *orig) { if (orig->len == TCP_FASTOPEN_COOKIE_SIZE && orig->len == foc->len && !memcmp(orig->val, foc->val, foc->len)) return true; return false; } static inline int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx) { return ctx->num; } /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. */ enum tcp_chrono { TCP_CHRONO_UNSPEC, TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */ TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */ TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */ __TCP_CHRONO_MAX, }; void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses * the same memory storage than skb->destructor/_skb_refdst */ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) { skb->destructor = NULL; skb->_skb_refdst = 0UL; } #define tcp_skb_tsorted_save(skb) { \ unsigned long _save = skb->_skb_refdst; \ skb->_skb_refdst = 0UL; #define tcp_skb_tsorted_restore(skb) \ skb->_skb_refdst = _save; \ } void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) { return skb_rb_first(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { return skb_rb_last(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); } #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, const struct sk_buff *skb) { return skb_queue_is_last(&sk->sk_write_queue, skb); } /** * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue * @sk: socket * * Since the write queue can have a temporary empty skb in it, * we must not use "return skb_queue_empty(&sk->sk_write_queue)" */ static inline bool tcp_write_queue_empty(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) { return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); } static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) { return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); /* Queue it, remembering where we must start sending. */ if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); } /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); } static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); sk_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { struct tcp_sock *tp = tcp_sk(sk); __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } } /* Start sequence of the skb just after the highest skb with SACKed * bit, valid only if sacked_out > 0 or when the caller has ensured * validity by itself. */ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { if (!tp->sacked_out) return tp->snd_una; if (tp->highest_sack == NULL) return tp->snd_nxt; return TCP_SKB_CB(tp->highest_sack)->seq; } static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) { return tcp_sk(sk)->highest_sack; } static inline void tcp_highest_sack_reset(struct sock *sk) { tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ static inline void tcp_highest_sack_replace(struct sock *sk, struct sk_buff *old, struct sk_buff *new) { if (old == tcp_highest_sack(sk)) tcp_sk(sk)->highest_sack = new; } /* This helper checks if socket has IP_TRANSPARENT set */ static inline bool inet_sk_transparent(const struct sock *sk) { switch (sk->sk_state) { case TCP_TIME_WAIT: return inet_twsk(sk)->tw_transparent; case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } return inet_sk(sk)->transparent; } /* Determines whether this is a thin stream (which may suffer from * increased latency). Used to trigger latency-reducing mechanisms. */ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) { return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp); } /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_ESTABLISHED, }; void *tcp_seq_start(struct seq_file *seq, loff_t *pos); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void tcp_seq_stop(struct seq_file *seq, void *v); struct tcp_seq_afinfo { sa_family_t family; }; struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; struct tcp_seq_afinfo *bpf_seq_afinfo; int bucket, offset, sbucket, num; loff_t last_pos; }; extern struct request_sock_ops tcp_request_sock_ops; extern struct request_sock_ops tcp6_request_sock_ops; void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); int tcp_gro_complete(struct sk_buff *skb); void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; } /* @wake is one when sk_stream_write_space() calls us. * This sends EPOLLOUT only if notsent_bytes is half the limit. * This mimics the strategy used in sock_def_write_space(). */ static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); #endif }; struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif void (*init_req)(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, const struct request_sock *req); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; #if IS_ENABLED(CONFIG_IPV6) extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; #endif #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { tcp_synq_overflow(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return ops->cookie_init_seq(skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { return 0; } #endif int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } /* * Save and compile IPv4 options, return a pointer to it */ static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } } return dopt; } /* locally generated TCP pure ACKs have skb->truesize == 2 * (check tcp_send_ack() in net/ipv4/tcp_output.c ) * This is much faster than dissecting the packet to find out. * (Think of GRE encapsulations, IPv4, IPv6, ...) */ static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb) { return skb->truesize == 2; } static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) { skb->truesize = 2; } static inline int tcp_inq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int answ; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { answ = 0; } else if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN was received */ if (answ && sock_flag(sk, SOCK_DONE)) answ--; } else { answ = tp->urg_seq - tp->copied_seq; } return answ; } int tcp_peek_len(struct socket *sock); static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) { u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); tp->segs_in += segs_in; if (skb->len > tcp_hdrlen(skb)) tp->data_segs_in += segs_in; } /* * TCP listen path runs lockless. * We forced "struct sock" to be const qualified to make sure * we don't modify one of its field by mistake. * Here, we increment sk_drops which is an atomic_t, so we can safely * make sock writable again. */ static inline void tcp_listendrop(const struct sock *sk) { atomic_inc(&((struct sock *)sk)->sk_drops); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); /* * Interface for adding Upper Level Protocols over TCP */ #define TCP_ULP_NAME_MAX 16 #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) struct tcp_ulp_ops { struct list_head list; /* initialize ulp */ int (*init)(struct sock *sk); /* update ulp */ void (*update)(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); /* cleanup ulp */ void (*release)(struct sock *sk); /* diagnostic */ int (*get_info)(const struct sock *sk, struct sk_buff *skb); size_t (*get_info_size)(const struct sock *sk); /* clone ulp */ void (*clone)(const struct request_sock *req, struct sock *newsk, const gfp_t priority); char name[TCP_ULP_NAME_MAX]; struct module *owner; }; int tcp_register_ulp(struct tcp_ulp_ops *type); void tcp_unregister_ulp(struct tcp_ulp_ops *type); int tcp_set_ulp(struct sock *sk, const char *name); void tcp_get_available_ulp(char *buf, size_t len); void tcp_cleanup_ulp(struct sock *sk); void tcp_update_ulp(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); #define MODULE_ALIAS_TCP_ULP(name) \ __MODULE_INFO(alias, alias_userspace, name); \ __MODULE_INFO(alias, alias_tcp_ulp, "tcp-ulp-" name) struct sk_msg; struct sk_psock; #ifdef CONFIG_BPF_STREAM_PARSER struct proto *tcp_bpf_get_proto(struct sock *sk, struct sk_psock *psock); void tcp_bpf_clone(const struct sock *sk, struct sock *newsk); #else static inline void tcp_bpf_clone(const struct sock *sk, struct sock *newsk) { } #endif /* CONFIG_BPF_STREAM_PARSER */ #ifdef CONFIG_NET_SOCK_MSG int tcp_bpf_sendmsg_redir(struct sock *sk, struct sk_msg *msg, u32 bytes, int flags); int __tcp_bpf_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, int len, int flags); #endif /* CONFIG_NET_SOCK_MSG */ #ifdef CONFIG_CGROUP_BPF static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { skops->skb = skb; skops->skb_data_end = skb->data + end_offset; } #else static inline void bpf_skops_init_skb(struct bpf_sock_ops_kern *skops, struct sk_buff *skb, unsigned int end_offset) { } #endif /* Call BPF_SOCK_OPS program that returns an int. If the return value * is < 0, then the BPF op failed (for example if the loaded BPF * program does not support the chosen operation or there is no BPF * program loaded). */ #ifdef CONFIG_BPF static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { struct bpf_sock_ops_kern sock_ops; int ret; memset(&sock_ops, 0, offsetof(struct bpf_sock_ops_kern, temp)); if (sk_fullsock(sk)) { sock_ops.is_fullsock = 1; sock_owned_by_me(sk); } sock_ops.sk = sk; sock_ops.op = op; if (nargs > 0) memcpy(sock_ops.args, args, nargs * sizeof(*args)); ret = BPF_CGROUP_RUN_PROG_SOCK_OPS(&sock_ops); if (ret == 0) ret = sock_ops.reply; else ret = -1; return ret; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { u32 args[2] = {arg1, arg2}; return tcp_call_bpf(sk, op, 2, args); } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { u32 args[3] = {arg1, arg2, arg3}; return tcp_call_bpf(sk, op, 3, args); } #else static inline int tcp_call_bpf(struct sock *sk, int op, u32 nargs, u32 *args) { return -EPERM; } static inline int tcp_call_bpf_2arg(struct sock *sk, int op, u32 arg1, u32 arg2) { return -EPERM; } static inline int tcp_call_bpf_3arg(struct sock *sk, int op, u32 arg1, u32 arg2, u32 arg3) { return -EPERM; } #endif static inline u32 tcp_timeout_init(struct sock *sk) { int timeout; timeout = tcp_call_bpf(sk, BPF_SOCK_OPS_TIMEOUT_INIT, 0, NULL); if (timeout <= 0) timeout = TCP_TIMEOUT_INIT; return timeout; } static inline u32 tcp_rwnd_init_bpf(struct sock *sk) { int rwnd; rwnd = tcp_call_bpf(sk, BPF_SOCK_OPS_RWND_INIT, 0, NULL); if (rwnd < 0) rwnd = 0; return rwnd; } static inline bool tcp_bpf_ca_needs_ecn(struct sock *sk) { return (tcp_call_bpf(sk, BPF_SOCK_OPS_NEEDS_ECN, 0, NULL) == 1); } static inline void tcp_bpf_rtt(struct sock *sk) { if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RTT_CB_FLAG)) tcp_call_bpf(sk, BPF_SOCK_OPS_RTT_CB, 0, NULL); } #if IS_ENABLED(CONFIG_SMC) extern struct static_key_false tcp_have_smc; #endif #if IS_ENABLED(CONFIG_TLS_DEVICE) void clean_acked_data_enable(struct inet_connection_sock *icsk, void (*cad)(struct sock *sk, u32 ack_seq)); void clean_acked_data_disable(struct inet_connection_sock *icsk); void clean_acked_data_flush(void); #endif DECLARE_STATIC_KEY_FALSE(tcp_tx_delay_enabled); static inline void tcp_add_tx_delay(struct sk_buff *skb, const struct tcp_sock *tp) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) skb->skb_mstamp_ns += (u64)tp->tcp_tx_delay * NSEC_PER_USEC; } /* Compute Earliest Departure Time for some control packets * like ACK or RST for TIME_WAIT or non ESTABLISHED sockets. */ static inline u64 tcp_transmit_time(const struct sock *sk) { if (static_branch_unlikely(&tcp_tx_delay_enabled)) { u32 delay = (sk->sk_state == TCP_TIME_WAIT) ? tcp_twsk(sk)->tw_tx_delay : tcp_sk(sk)->tcp_tx_delay; return tcp_clock_ns() + (u64)delay * NSEC_PER_USEC; } return 0; } #endif /* _TCP_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ /* * A security context is a set of security attributes * associated with each subject and object controlled * by the security policy. Security contexts are * externally represented as variable-length strings * that can be interpreted by a user or application * with an understanding of the security policy. * Internally, the security server uses a simple * structure. This structure is private to the * security server and can be changed without affecting * clients of the security server. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_CONTEXT_H_ #define _SS_CONTEXT_H_ #include "ebitmap.h" #include "mls_types.h" #include "security.h" /* * A security context consists of an authenticated user * identity, a role, a type and a MLS range. */ struct context { u32 user; u32 role; u32 type; u32 len; /* length of string in bytes */ struct mls_range range; char *str; /* string representation if context cannot be mapped. */ }; static inline void mls_context_init(struct context *c) { memset(&c->range, 0, sizeof(c->range)); } static inline int mls_context_cpy(struct context *dst, struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the low level of 'src'. */ static inline int mls_context_cpy_low(struct context *dst, struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the high level of 'src'. */ static inline int mls_context_cpy_high(struct context *dst, struct context *src) { int rc; dst->range.level[0].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } static inline int mls_context_glblub(struct context *dst, struct context *c1, struct context *c2) { struct mls_range *dr = &dst->range, *r1 = &c1->range, *r2 = &c2->range; int rc = 0; if (r1->level[1].sens < r2->level[0].sens || r2->level[1].sens < r1->level[0].sens) /* These ranges have no common sensitivities */ return -EINVAL; /* Take the greatest of the low */ dr->level[0].sens = max(r1->level[0].sens, r2->level[0].sens); /* Take the least of the high */ dr->level[1].sens = min(r1->level[1].sens, r2->level[1].sens); rc = ebitmap_and(&dr->level[0].cat, &r1->level[0].cat, &r2->level[0].cat); if (rc) goto out; rc = ebitmap_and(&dr->level[1].cat, &r1->level[1].cat, &r2->level[1].cat); if (rc) goto out; out: return rc; } static inline int mls_context_cmp(struct context *c1, struct context *c2) { return ((c1->range.level[0].sens == c2->range.level[0].sens) && ebitmap_cmp(&c1->range.level[0].cat, &c2->range.level[0].cat) && (c1->range.level[1].sens == c2->range.level[1].sens) && ebitmap_cmp(&c1->range.level[1].cat, &c2->range.level[1].cat)); } static inline void mls_context_destroy(struct context *c) { ebitmap_destroy(&c->range.level[0].cat); ebitmap_destroy(&c->range.level[1].cat); mls_context_init(c); } static inline void context_init(struct context *c) { memset(c, 0, sizeof(*c)); } static inline int context_cpy(struct context *dst, struct context *src) { int rc; dst->user = src->user; dst->role = src->role; dst->type = src->type; if (src->str) { dst->str = kstrdup(src->str, GFP_ATOMIC); if (!dst->str) return -ENOMEM; dst->len = src->len; } else { dst->str = NULL; dst->len = 0; } rc = mls_context_cpy(dst, src); if (rc) { kfree(dst->str); return rc; } return 0; } static inline void context_destroy(struct context *c) { c->user = c->role = c->type = 0; kfree(c->str); c->str = NULL; c->len = 0; mls_context_destroy(c); } static inline int context_cmp(struct context *c1, struct context *c2) { if (c1->len && c2->len) return (c1->len == c2->len && !strcmp(c1->str, c2->str)); if (c1->len || c2->len) return 0; return ((c1->user == c2->user) && (c1->role == c2->role) && (c1->type == c2->type) && mls_context_cmp(c1, c2)); } u32 context_compute_hash(const struct context *c); #endif /* _SS_CONTEXT_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/cpu.h - generic cpu definition * * This is mainly for topological representation. We define the * basic 'struct cpu' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/cpu.c * * CPUs are exported via sysfs in the devices/system/cpu * directory. */ #ifndef _LINUX_CPU_H_ #define _LINUX_CPU_H_ #include <linux/node.h> #include <linux/compiler.h> #include <linux/cpumask.h> #include <linux/cpuhotplug.h> struct device; struct device_node; struct attribute_group; struct cpu { int node_id; /* The node which contains the CPU */ int hotpluggable; /* creates sysfs control file if hotpluggable */ struct device dev; }; extern void boot_cpu_init(void); extern void boot_cpu_hotplug_init(void); extern void cpu_init(void); extern void trap_init(void); extern int register_cpu(struct cpu *cpu, int num); extern struct device *get_cpu_device(unsigned cpu); extern bool cpu_is_hotpluggable(unsigned cpu); extern bool arch_match_cpu_phys_id(int cpu, u64 phys_id); extern bool arch_find_n_match_cpu_physical_id(struct device_node *cpun, int cpu, unsigned int *thread); extern int cpu_add_dev_attr(struct device_attribute *attr); extern void cpu_remove_dev_attr(struct device_attribute *attr); extern int cpu_add_dev_attr_group(struct attribute_group *attrs); extern void cpu_remove_dev_attr_group(struct attribute_group *attrs); extern ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_spectre_v1(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_spectre_v2(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_l1tf(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_mds(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_tsx_async_abort(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_itlb_multihit(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, const struct attribute_group **groups, const char *fmt, ...); #ifdef CONFIG_HOTPLUG_CPU extern void unregister_cpu(struct cpu *cpu); extern ssize_t arch_cpu_probe(const char *, size_t); extern ssize_t arch_cpu_release(const char *, size_t); #endif /* * These states are not related to the core CPU hotplug mechanism. They are * used by various (sub)architectures to track internal state */ #define CPU_ONLINE 0x0002 /* CPU is up */ #define CPU_UP_PREPARE 0x0003 /* CPU coming up */ #define CPU_DEAD 0x0007 /* CPU dead */ #define CPU_DEAD_FROZEN 0x0008 /* CPU timed out on unplug */ #define CPU_POST_DEAD 0x0009 /* CPU successfully unplugged */ #define CPU_BROKEN 0x000B /* CPU did not die properly */ #ifdef CONFIG_SMP extern bool cpuhp_tasks_frozen; int add_cpu(unsigned int cpu); int cpu_device_up(struct device *dev); void notify_cpu_starting(unsigned int cpu); extern void cpu_maps_update_begin(void); extern void cpu_maps_update_done(void); int bringup_hibernate_cpu(unsigned int sleep_cpu); void bringup_nonboot_cpus(unsigned int setup_max_cpus); #else /* CONFIG_SMP */ #define cpuhp_tasks_frozen 0 static inline void cpu_maps_update_begin(void) { } static inline void cpu_maps_update_done(void) { } #endif /* CONFIG_SMP */ extern struct bus_type cpu_subsys; #ifdef CONFIG_HOTPLUG_CPU extern void cpus_write_lock(void); extern void cpus_write_unlock(void); extern void cpus_read_lock(void); extern void cpus_read_unlock(void); extern int cpus_read_trylock(void); extern void lockdep_assert_cpus_held(void); extern void cpu_hotplug_disable(void); extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int remove_cpu(unsigned int cpu); int cpu_device_down(struct device *dev); extern void smp_shutdown_nonboot_cpus(unsigned int primary_cpu); #else /* CONFIG_HOTPLUG_CPU */ static inline void cpus_write_lock(void) { } static inline void cpus_write_unlock(void) { } static inline void cpus_read_lock(void) { } static inline void cpus_read_unlock(void) { } static inline int cpus_read_trylock(void) { return true; } static inline void lockdep_assert_cpus_held(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } #endif /* !CONFIG_HOTPLUG_CPU */ /* Wrappers which go away once all code is converted */ static inline void cpu_hotplug_begin(void) { cpus_write_lock(); } static inline void cpu_hotplug_done(void) { cpus_write_unlock(); } static inline void get_online_cpus(void) { cpus_read_lock(); } static inline void put_online_cpus(void) { cpus_read_unlock(); } #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); extern void thaw_secondary_cpus(void); static inline int suspend_disable_secondary_cpus(void) { int cpu = 0; if (IS_ENABLED(CONFIG_PM_SLEEP_SMP_NONZERO_CPU)) cpu = -1; return freeze_secondary_cpus(cpu); } static inline void suspend_enable_secondary_cpus(void) { return thaw_secondary_cpus(); } #else /* !CONFIG_PM_SLEEP_SMP */ static inline void thaw_secondary_cpus(void) {} static inline int suspend_disable_secondary_cpus(void) { return 0; } static inline void suspend_enable_secondary_cpus(void) { } #endif /* !CONFIG_PM_SLEEP_SMP */ void cpu_startup_entry(enum cpuhp_state state); void cpu_idle_poll_ctrl(bool enable); /* Attach to any functions which should be considered cpuidle. */ #define __cpuidle __section(".cpuidle.text") bool cpu_in_idle(unsigned long pc); void arch_cpu_idle(void); void arch_cpu_idle_prepare(void); void arch_cpu_idle_enter(void); void arch_cpu_idle_exit(void); void arch_cpu_idle_dead(void); int cpu_report_state(int cpu); int cpu_check_up_prepare(int cpu); void cpu_set_state_online(int cpu); void play_idle_precise(u64 duration_ns, u64 latency_ns); static inline void play_idle(unsigned long duration_us) { play_idle_precise(duration_us * NSEC_PER_USEC, U64_MAX); } #ifdef CONFIG_HOTPLUG_CPU bool cpu_wait_death(unsigned int cpu, int seconds); bool cpu_report_death(void); void cpuhp_report_idle_dead(void); #else static inline void cpuhp_report_idle_dead(void) { } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ enum cpuhp_smt_control { CPU_SMT_ENABLED, CPU_SMT_DISABLED, CPU_SMT_FORCE_DISABLED, CPU_SMT_NOT_SUPPORTED, CPU_SMT_NOT_IMPLEMENTED, }; #if defined(CONFIG_SMP) && defined(CONFIG_HOTPLUG_SMT) extern enum cpuhp_smt_control cpu_smt_control; extern void cpu_smt_disable(bool force); extern void cpu_smt_check_topology(void); extern bool cpu_smt_possible(void); extern int cpuhp_smt_enable(void); extern int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval); #else # define cpu_smt_control (CPU_SMT_NOT_IMPLEMENTED) static inline void cpu_smt_disable(bool force) { } static inline void cpu_smt_check_topology(void) { } static inline bool cpu_smt_possible(void) { return false; } static inline int cpuhp_smt_enable(void) { return 0; } static inline int cpuhp_smt_disable(enum cpuhp_smt_control ctrlval) { return 0; } #endif extern bool cpu_mitigations_off(void); extern bool cpu_mitigations_auto_nosmt(void); #endif /* _LINUX_CPU_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 /* SPDX-License-Identifier: GPL-2.0 */ /* * include/linux/writeback.h */ #ifndef WRITEBACK_H #define WRITEBACK_H #include <linux/sched.h> #include <linux/workqueue.h> #include <linux/fs.h> #include <linux/flex_proportions.h> #include <linux/backing-dev-defs.h> #include <linux/blk_types.h> #include <linux/blk-cgroup.h> struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* * The 1/4 region under the global dirty thresh is for smooth dirty throttling: * * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) * * Further beyond, all dirtier tasks will enter a loop waiting (possibly long * time) for the dirty pages to drop, unless written enough pages. * * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 #define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) struct backing_dev_info; /* * fs/fs-writeback.c */ enum writeback_sync_modes { WB_SYNC_NONE, /* Don't wait on anything */ WB_SYNC_ALL, /* Wait on every mapping */ }; /* * A control structure which tells the writeback code what to do. These are * always on the stack, and hence need no locking. They are always initialised * in a manner such that unspecified fields are set to zero. */ struct writeback_control { long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ /* * For a_ops->writepages(): if start or end are non-zero then this is * a hint that the filesystem need only write out the pages inside that * byterange. The byte at `end' is included in the writeout request. */ loff_t range_start; loff_t range_end; enum writeback_sync_modes sync_mode; unsigned for_kupdate:1; /* A kupdate writeback */ unsigned for_background:1; /* A background writeback */ unsigned tagged_writepages:1; /* tag-and-write to avoid livelock */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ /* * When writeback IOs are bounced through async layers, only the * initial synchronous phase should be accounted towards inode * cgroup ownership arbitration to avoid confusion. Later stages * can set the following flag to disable the accounting. */ unsigned no_cgroup_owner:1; unsigned punt_to_cgroup:1; /* cgrp punting, see __REQ_CGROUP_PUNT */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *wb; /* wb this writeback is issued under */ struct inode *inode; /* inode being written out */ /* foreign inode detection, see wbc_detach_inode() */ int wb_id; /* current wb id */ int wb_lcand_id; /* last foreign candidate wb id */ int wb_tcand_id; /* this foreign candidate wb id */ size_t wb_bytes; /* bytes written by current wb */ size_t wb_lcand_bytes; /* bytes written by last candidate */ size_t wb_tcand_bytes; /* bytes written by this candidate */ #endif }; static inline int wbc_to_write_flags(struct writeback_control *wbc) { int flags = 0; if (wbc->punt_to_cgroup) flags = REQ_CGROUP_PUNT; if (wbc->sync_mode == WB_SYNC_ALL) flags |= REQ_SYNC; else if (wbc->for_kupdate || wbc->for_background) flags |= REQ_BACKGROUND; return flags; } static inline struct cgroup_subsys_state * wbc_blkcg_css(struct writeback_control *wbc) { #ifdef CONFIG_CGROUP_WRITEBACK if (wbc->wb) return wbc->wb->blkcg_css; #endif return blkcg_root_css; } /* * A wb_domain represents a domain that wb's (bdi_writeback's) belong to * and are measured against each other in. There always is one global * domain, global_wb_domain, that every wb in the system is a member of. * This allows measuring the relative bandwidth of each wb to distribute * dirtyable memory accordingly. */ struct wb_domain { spinlock_t lock; /* * Scale the writeback cache size proportional to the relative * writeout speed. * * We do this by keeping a floating proportion between BDIs, based * on page writeback completions [end_page_writeback()]. Those * devices that write out pages fastest will get the larger share, * while the slower will get a smaller share. * * We use page writeout completions because we are interested in * getting rid of dirty pages. Having them written out is the * primary goal. * * We introduce a concept of time, a period over which we measure * these events, because demand can/will vary over time. The length * of this period itself is measured in page writeback completions. */ struct fprop_global completions; struct timer_list period_timer; /* timer for aging of completions */ unsigned long period_time; /* * The dirtyable memory and dirty threshold could be suddenly * knocked down by a large amount (eg. on the startup of KVM in a * swapless system). This may throw the system into deep dirty * exceeded state and throttle heavy/light dirtiers alike. To * retain good responsiveness, maintain global_dirty_limit for * tracking slowly down to the knocked down dirty threshold. * * Both fields are protected by ->lock. */ unsigned long dirty_limit_tstamp; unsigned long dirty_limit; }; /** * wb_domain_size_changed - memory available to a wb_domain has changed * @dom: wb_domain of interest * * This function should be called when the amount of memory available to * @dom has changed. It resets @dom's dirty limit parameters to prevent * the past values which don't match the current configuration from skewing * dirty throttling. Without this, when memory size of a wb_domain is * greatly reduced, the dirty throttling logic may allow too many pages to * be dirtied leading to consecutive unnecessary OOMs and may get stuck in * that situation. */ static inline void wb_domain_size_changed(struct wb_domain *dom) { spin_lock(&dom->lock); dom->dirty_limit_tstamp = jiffies; dom->dirty_limit = 0; spin_unlock(&dom->lock); } /* * fs/fs-writeback.c */ struct bdi_writeback; void writeback_inodes_sb(struct super_block *, enum wb_reason reason); void writeback_inodes_sb_nr(struct super_block *, unsigned long nr, enum wb_reason reason); void try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason); void sync_inodes_sb(struct super_block *); void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) { might_sleep(); wait_on_bit(&inode->i_state, __I_NEW, TASK_UNINTERRUPTIBLE); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/cgroup.h> #include <linux/bio.h> void __inode_attach_wb(struct inode *inode, struct page *page); void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock); void wbc_detach_inode(struct writeback_control *wbc); void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes); int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, unsigned long nr_pages, enum wb_reason reason, struct wb_completion *done); void cgroup_writeback_umount(void); /** * inode_attach_wb - associate an inode with its wb * @inode: inode of interest * @page: page being dirtied (may be NULL) * * If @inode doesn't have its wb, associate it with the wb matching the * memcg of @page or, if @page is NULL, %current. May be called w/ or w/o * @inode->i_lock. */ static inline void inode_attach_wb(struct inode *inode, struct page *page) { if (!inode->i_wb) __inode_attach_wb(inode, page); } /** * inode_detach_wb - disassociate an inode from its wb * @inode: inode of interest * * @inode is being freed. Detach from its wb. */ static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } } /** * wbc_attach_fdatawrite_inode - associate wbc and inode for fdatawrite * @wbc: writeback_control of interest * @inode: target inode * * This function is to be used by __filemap_fdatawrite_range(), which is an * alternative entry point into writeback code, and first ensures @inode is * associated with a bdi_writeback and attaches it to @wbc. */ static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { spin_lock(&inode->i_lock); inode_attach_wb(inode, NULL); wbc_attach_and_unlock_inode(wbc, inode); } /** * wbc_init_bio - writeback specific initializtion of bio * @wbc: writeback_control for the writeback in progress * @bio: bio to be initialized * * @bio is a part of the writeback in progress controlled by @wbc. Perform * writeback specific initialization. This is used to apply the cgroup * writeback context. Must be called after the bio has been associated with * a device. */ static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { /* * pageout() path doesn't attach @wbc to the inode being written * out. This is intentional as we don't want the function to block * behind a slow cgroup. Ultimately, we want pageout() to kick off * regular writeback instead of writing things out itself. */ if (wbc->wb) bio_associate_blkg_from_css(bio, wbc->wb->blkcg_css); } #else /* CONFIG_CGROUP_WRITEBACK */ static inline void inode_attach_wb(struct inode *inode, struct page *page) { } static inline void inode_detach_wb(struct inode *inode) { } static inline void wbc_attach_and_unlock_inode(struct writeback_control *wbc, struct inode *inode) __releases(&inode->i_lock) { spin_unlock(&inode->i_lock); } static inline void wbc_attach_fdatawrite_inode(struct writeback_control *wbc, struct inode *inode) { } static inline void wbc_detach_inode(struct writeback_control *wbc) { } static inline void wbc_init_bio(struct writeback_control *wbc, struct bio *bio) { } static inline void wbc_account_cgroup_owner(struct writeback_control *wbc, struct page *page, size_t bytes) { } static inline void cgroup_writeback_umount(void) { } #endif /* CONFIG_CGROUP_WRITEBACK */ /* * mm/page-writeback.c */ #ifdef CONFIG_BLOCK void laptop_io_completion(struct backing_dev_info *info); void laptop_sync_completion(void); void laptop_mode_sync(struct work_struct *work); void laptop_mode_timer_fn(struct timer_list *t); #else static inline void laptop_sync_completion(void) { } #endif bool node_dirty_ok(struct pglist_data *pgdat); int wb_domain_init(struct wb_domain *dom, gfp_t gfp); #ifdef CONFIG_CGROUP_WRITEBACK void wb_domain_exit(struct wb_domain *dom); #endif extern struct wb_domain global_wb_domain; /* These are exported to sysctl. */ extern int dirty_background_ratio; extern unsigned long dirty_background_bytes; extern int vm_dirty_ratio; extern unsigned long vm_dirty_bytes; extern unsigned int dirty_writeback_interval; extern unsigned int dirty_expire_interval; extern unsigned int dirtytime_expire_interval; extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; int dirty_background_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int dirty_background_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int dirty_bytes_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int dirtytime_interval_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); int dirty_writeback_centisecs_handler(struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos); void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty); unsigned long wb_calc_thresh(struct bdi_writeback *wb, unsigned long thresh); void wb_update_bandwidth(struct bdi_writeback *wb, unsigned long start_time); void balance_dirty_pages_ratelimited(struct address_space *mapping); bool wb_over_bg_thresh(struct bdi_writeback *wb); typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc, void *data); int generic_writepages(struct address_space *mapping, struct writeback_control *wbc); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data); int do_writepages(struct address_space *mapping, struct writeback_control *wbc); void writeback_set_ratelimit(void); void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end); void account_page_redirty(struct page *page); void sb_mark_inode_writeback(struct inode *inode); void sb_clear_inode_writeback(struct inode *inode); #endif /* WRITEBACK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * RNG: Random Number Generator algorithms under the crypto API * * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_RNG_H #define _CRYPTO_RNG_H #include <linux/crypto.h> struct crypto_rng; /** * struct rng_alg - random number generator definition * * @generate: The function defined by this variable obtains a * random number. The random number generator transform * must generate the random number out of the context * provided with this call, plus any additional data * if provided to the call. * @seed: Seed or reseed the random number generator. With the * invocation of this function call, the random number * generator shall become ready for generation. If the * random number generator requires a seed for setting * up a new state, the seed must be provided by the * consumer while invoking this function. The required * size of the seed is defined with @seedsize . * @set_ent: Set entropy that would otherwise be obtained from * entropy source. Internal use only. * @seedsize: The seed size required for a random number generator * initialization defined with this variable. Some * random number generators does not require a seed * as the seeding is implemented internally without * the need of support by the consumer. In this case, * the seed size is set to zero. * @base: Common crypto API algorithm data structure. */ struct rng_alg { int (*generate)(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen); int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); void (*set_ent)(struct crypto_rng *tfm, const u8 *data, unsigned int len); unsigned int seedsize; struct crypto_alg base; }; struct crypto_rng { struct crypto_tfm base; }; extern struct crypto_rng *crypto_default_rng; int crypto_get_default_rng(void); void crypto_put_default_rng(void); /** * DOC: Random number generator API * * The random number generator API is used with the ciphers of type * CRYPTO_ALG_TYPE_RNG (listed as type "rng" in /proc/crypto) */ /** * crypto_alloc_rng() -- allocate RNG handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * message digest cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a random number generator. The returned struct * crypto_rng is the cipher handle that is required for any subsequent * API invocation for that random number generator. * * For all random number generators, this call creates a new private copy of * the random number generator that does not share a state with other * instances. The only exception is the "krng" random number generator which * is a kernel crypto API use case for the get_random_bytes() function of the * /dev/random driver. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm) { return &tfm->base; } /** * crypto_rng_alg - obtain name of RNG * @tfm: cipher handle * * Return the generic name (cra_name) of the initialized random number generator * * Return: generic name string */ static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm) { return container_of(crypto_rng_tfm(tfm)->__crt_alg, struct rng_alg, base); } /** * crypto_free_rng() - zeroize and free RNG handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_rng(struct crypto_rng *tfm) { crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm)); } /** * crypto_rng_generate() - get random number * @tfm: cipher handle * @src: Input buffer holding additional data, may be NULL * @slen: Length of additional data * @dst: output buffer holding the random numbers * @dlen: length of the output buffer * * This function fills the caller-allocated buffer with random * numbers using the random number generator referenced by the * cipher handle. * * Return: 0 function was successful; < 0 if an error occurred */ static inline int crypto_rng_generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { struct crypto_alg *alg = tfm->base.__crt_alg; int ret; crypto_stats_get(alg); ret = crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen); crypto_stats_rng_generate(alg, dlen, ret); return ret; } /** * crypto_rng_get_bytes() - get random number * @tfm: cipher handle * @rdata: output buffer holding the random numbers * @dlen: length of the output buffer * * This function fills the caller-allocated buffer with random numbers using the * random number generator referenced by the cipher handle. * * Return: 0 function was successful; < 0 if an error occurred */ static inline int crypto_rng_get_bytes(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen) { return crypto_rng_generate(tfm, NULL, 0, rdata, dlen); } /** * crypto_rng_reset() - re-initialize the RNG * @tfm: cipher handle * @seed: seed input data * @slen: length of the seed input data * * The reset function completely re-initializes the random number generator * referenced by the cipher handle by clearing the current state. The new state * is initialized with the caller provided seed or automatically, depending * on the random number generator type (the ANSI X9.31 RNG requires * caller-provided seed, the SP800-90A DRBGs perform an automatic seeding). * The seed is provided as a parameter to this function call. The provided seed * should have the length of the seed size defined for the random number * generator as defined by crypto_rng_seedsize. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); /** * crypto_rng_seedsize() - obtain seed size of RNG * @tfm: cipher handle * * The function returns the seed size for the random number generator * referenced by the cipher handle. This value may be zero if the random * number generator does not implement or require a reseeding. For example, * the SP800-90A DRBGs implement an automated reseeding after reaching a * pre-defined threshold. * * Return: seed size for the random number generator */ static inline int crypto_rng_seedsize(struct crypto_rng *tfm) { return crypto_rng_alg(tfm)->seedsize; } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 /* SPDX-License-Identifier: GPL-2.0-only */ /* * A policy database (policydb) specifies the * configuration data for the security policy. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * * Support for enhanced MLS infrastructure. * * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com> * * Added conditional policy language extensions * * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC */ #ifndef _SS_POLICYDB_H_ #define _SS_POLICYDB_H_ #include "symtab.h" #include "avtab.h" #include "sidtab.h" #include "ebitmap.h" #include "mls_types.h" #include "context.h" #include "constraint.h" /* * A datum type is defined for each kind of symbol * in the configuration data: individual permissions, * common prefixes for access vectors, classes, * users, roles, types, sensitivities, categories, etc. */ /* Permission attributes */ struct perm_datum { u32 value; /* permission bit + 1 */ }; /* Attributes of a common prefix for access vectors */ struct common_datum { u32 value; /* internal common value */ struct symtab permissions; /* common permissions */ }; /* Class attributes */ struct class_datum { u32 value; /* class value */ char *comkey; /* common name */ struct common_datum *comdatum; /* common datum */ struct symtab permissions; /* class-specific permission symbol table */ struct constraint_node *constraints; /* constraints on class permissions */ struct constraint_node *validatetrans; /* special transition rules */ /* Options how a new object user, role, and type should be decided */ #define DEFAULT_SOURCE 1 #define DEFAULT_TARGET 2 char default_user; char default_role; char default_type; /* Options how a new object range should be decided */ #define DEFAULT_SOURCE_LOW 1 #define DEFAULT_SOURCE_HIGH 2 #define DEFAULT_SOURCE_LOW_HIGH 3 #define DEFAULT_TARGET_LOW 4 #define DEFAULT_TARGET_HIGH 5 #define DEFAULT_TARGET_LOW_HIGH 6 #define DEFAULT_GLBLUB 7 char default_range; }; /* Role attributes */ struct role_datum { u32 value; /* internal role value */ u32 bounds; /* boundary of role */ struct ebitmap dominates; /* set of roles dominated by this role */ struct ebitmap types; /* set of authorized types for role */ }; struct role_trans_key { u32 role; /* current role */ u32 type; /* program executable type, or new object type */ u32 tclass; /* process class, or new object class */ }; struct role_trans_datum { u32 new_role; /* new role */ }; struct filename_trans_key { u32 ttype; /* parent dir context */ u16 tclass; /* class of new object */ const char *name; /* last path component */ }; struct filename_trans_datum { struct ebitmap stypes; /* bitmap of source types for this otype */ u32 otype; /* resulting type of new object */ struct filename_trans_datum *next; /* record for next otype*/ }; struct role_allow { u32 role; /* current role */ u32 new_role; /* new role */ struct role_allow *next; }; /* Type attributes */ struct type_datum { u32 value; /* internal type value */ u32 bounds; /* boundary of type */ unsigned char primary; /* primary name? */ unsigned char attribute;/* attribute ?*/ }; /* User attributes */ struct user_datum { u32 value; /* internal user value */ u32 bounds; /* bounds of user */ struct ebitmap roles; /* set of authorized roles for user */ struct mls_range range; /* MLS range (min - max) for user */ struct mls_level dfltlevel; /* default login MLS level for user */ }; /* Sensitivity attributes */ struct level_datum { struct mls_level *level; /* sensitivity and associated categories */ unsigned char isalias; /* is this sensitivity an alias for another? */ }; /* Category attributes */ struct cat_datum { u32 value; /* internal category bit + 1 */ unsigned char isalias; /* is this category an alias for another? */ }; struct range_trans { u32 source_type; u32 target_type; u32 target_class; }; /* Boolean data type */ struct cond_bool_datum { __u32 value; /* internal type value */ int state; }; struct cond_node; /* * type set preserves data needed to determine constraint info from * policy source. This is not used by the kernel policy but allows * utilities such as audit2allow to determine constraint denials. */ struct type_set { struct ebitmap types; struct ebitmap negset; u32 flags; }; /* * The configuration data includes security contexts for * initial SIDs, unlabeled file systems, TCP and UDP port numbers, * network interfaces, and nodes. This structure stores the * relevant data for one such entry. Entries of the same kind * (e.g. all initial SIDs) are linked together into a list. */ struct ocontext { union { char *name; /* name of initial SID, fs, netif, fstype, path */ struct { u8 protocol; u16 low_port; u16 high_port; } port; /* TCP or UDP port information */ struct { u32 addr; u32 mask; } node; /* node information */ struct { u32 addr[4]; u32 mask[4]; } node6; /* IPv6 node information */ struct { u64 subnet_prefix; u16 low_pkey; u16 high_pkey; } ibpkey; struct { char *dev_name; u8 port; } ibendport; } u; union { u32 sclass; /* security class for genfs */ u32 behavior; /* labeling behavior for fs_use */ } v; struct context context[2]; /* security context(s) */ u32 sid[2]; /* SID(s) */ struct ocontext *next; }; struct genfs { char *fstype; struct ocontext *head; struct genfs *next; }; /* symbol table array indices */ #define SYM_COMMONS 0 #define SYM_CLASSES 1 #define SYM_ROLES 2 #define SYM_TYPES 3 #define SYM_USERS 4 #define SYM_BOOLS 5 #define SYM_LEVELS 6 #define SYM_CATS 7 #define SYM_NUM 8 /* object context array indices */ #define OCON_ISID 0 /* initial SIDs */ #define OCON_FS 1 /* unlabeled file systems */ #define OCON_PORT 2 /* TCP and UDP port numbers */ #define OCON_NETIF 3 /* network interfaces */ #define OCON_NODE 4 /* nodes */ #define OCON_FSUSE 5 /* fs_use */ #define OCON_NODE6 6 /* IPv6 nodes */ #define OCON_IBPKEY 7 /* Infiniband PKeys */ #define OCON_IBENDPORT 8 /* Infiniband end ports */ #define OCON_NUM 9 /* The policy database */ struct policydb { int mls_enabled; /* symbol tables */ struct symtab symtab[SYM_NUM]; #define p_commons symtab[SYM_COMMONS] #define p_classes symtab[SYM_CLASSES] #define p_roles symtab[SYM_ROLES] #define p_types symtab[SYM_TYPES] #define p_users symtab[SYM_USERS] #define p_bools symtab[SYM_BOOLS] #define p_levels symtab[SYM_LEVELS] #define p_cats symtab[SYM_CATS] /* symbol names indexed by (value - 1) */ char **sym_val_to_name[SYM_NUM]; /* class, role, and user attributes indexed by (value - 1) */ struct class_datum **class_val_to_struct; struct role_datum **role_val_to_struct; struct user_datum **user_val_to_struct; struct type_datum **type_val_to_struct; /* type enforcement access vectors and transitions */ struct avtab te_avtab; /* role transitions */ struct hashtab role_tr; /* file transitions with the last path component */ /* quickly exclude lookups when parent ttype has no rules */ struct ebitmap filename_trans_ttypes; /* actual set of filename_trans rules */ struct hashtab filename_trans; /* only used if policyvers < POLICYDB_VERSION_COMP_FTRANS */ u32 compat_filename_trans_count; /* bools indexed by (value - 1) */ struct cond_bool_datum **bool_val_to_struct; /* type enforcement conditional access vectors and transitions */ struct avtab te_cond_avtab; /* array indexing te_cond_avtab by conditional */ struct cond_node *cond_list; u32 cond_list_len; /* role allows */ struct role_allow *role_allow; /* security contexts of initial SIDs, unlabeled file systems, TCP or UDP port numbers, network interfaces and nodes */ struct ocontext *ocontexts[OCON_NUM]; /* security contexts for files in filesystems that cannot support a persistent label mapping or use another fixed labeling behavior. */ struct genfs *genfs; /* range transitions table (range_trans_key -> mls_range) */ struct hashtab range_tr; /* type -> attribute reverse mapping */ struct ebitmap *type_attr_map_array; struct ebitmap policycaps; struct ebitmap permissive_map; /* length of this policy when it was loaded */ size_t len; unsigned int policyvers; unsigned int reject_unknown : 1; unsigned int allow_unknown : 1; u16 process_class; u32 process_trans_perms; } __randomize_layout; extern void policydb_destroy(struct policydb *p); extern int policydb_load_isids(struct policydb *p, struct sidtab *s); extern int policydb_context_isvalid(struct policydb *p, struct context *c); extern int policydb_class_isvalid(struct policydb *p, unsigned int class); extern int policydb_type_isvalid(struct policydb *p, unsigned int type); extern int policydb_role_isvalid(struct policydb *p, unsigned int role); extern int policydb_read(struct policydb *p, void *fp); extern int policydb_write(struct policydb *p, void *fp); extern struct filename_trans_datum *policydb_filenametr_search( struct policydb *p, struct filename_trans_key *key); extern struct mls_range *policydb_rangetr_search( struct policydb *p, struct range_trans *key); extern struct role_trans_datum *policydb_roletr_search( struct policydb *p, struct role_trans_key *key); #define POLICYDB_CONFIG_MLS 1 /* the config flags related to unknown classes/perms are bits 2 and 3 */ #define REJECT_UNKNOWN 0x00000002 #define ALLOW_UNKNOWN 0x00000004 #define OBJECT_R "object_r" #define OBJECT_R_VAL 1 #define POLICYDB_MAGIC SELINUX_MAGIC #define POLICYDB_STRING "SE Linux" struct policy_file { char *data; size_t len; }; struct policy_data { struct policydb *p; void *fp; }; static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes) { if (bytes > fp->len) return -EINVAL; memcpy(buf, fp->data, bytes); fp->data += bytes; fp->len -= bytes; return 0; } static inline int put_entry(const void *buf, size_t bytes, int num, struct policy_file *fp) { size_t len = bytes * num; memcpy(fp->data, buf, len); fp->data += len; fp->len -= len; return 0; } static inline char *sym_name(struct policydb *p, unsigned int sym_num, unsigned int element_nr) { return p->sym_val_to_name[sym_num][element_nr]; } extern u16 string_to_security_class(struct policydb *p, const char *name); extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name); #endif /* _SS_POLICYDB_H_ */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220