1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_FLOW_DISSECTOR_H #define _NET_FLOW_DISSECTOR_H #include <linux/types.h> #include <linux/in6.h> #include <linux/siphash.h> #include <linux/string.h> #include <uapi/linux/if_ether.h> struct bpf_prog; struct net; struct sk_buff; /** * struct flow_dissector_key_control: * @thoff: Transport header offset */ struct flow_dissector_key_control { u16 thoff; u16 addr_type; u32 flags; }; #define FLOW_DIS_IS_FRAGMENT BIT(0) #define FLOW_DIS_FIRST_FRAG BIT(1) #define FLOW_DIS_ENCAPSULATION BIT(2) enum flow_dissect_ret { FLOW_DISSECT_RET_OUT_GOOD, FLOW_DISSECT_RET_OUT_BAD, FLOW_DISSECT_RET_PROTO_AGAIN, FLOW_DISSECT_RET_IPPROTO_AGAIN, FLOW_DISSECT_RET_CONTINUE, }; /** * struct flow_dissector_key_basic: * @n_proto: Network header protocol (eg. IPv4/IPv6) * @ip_proto: Transport header protocol (eg. TCP/UDP) */ struct flow_dissector_key_basic { __be16 n_proto; u8 ip_proto; u8 padding; }; struct flow_dissector_key_tags { u32 flow_label; }; struct flow_dissector_key_vlan { union { struct { u16 vlan_id:12, vlan_dei:1, vlan_priority:3; }; __be16 vlan_tci; }; __be16 vlan_tpid; }; struct flow_dissector_mpls_lse { u32 mpls_ttl:8, mpls_bos:1, mpls_tc:3, mpls_label:20; }; #define FLOW_DIS_MPLS_MAX 7 struct flow_dissector_key_mpls { struct flow_dissector_mpls_lse ls[FLOW_DIS_MPLS_MAX]; /* Label Stack */ u8 used_lses; /* One bit set for each Label Stack Entry in use */ }; static inline void dissector_set_mpls_lse(struct flow_dissector_key_mpls *mpls, int lse_index) { mpls->used_lses |= 1 << lse_index; } #define FLOW_DIS_TUN_OPTS_MAX 255 /** * struct flow_dissector_key_enc_opts: * @data: tunnel option data * @len: length of tunnel option data * @dst_opt_type: tunnel option type */ struct flow_dissector_key_enc_opts { u8 data[FLOW_DIS_TUN_OPTS_MAX]; /* Using IP_TUNNEL_OPTS_MAX is desired * here but seems difficult to #include */ u8 len; __be16 dst_opt_type; }; struct flow_dissector_key_keyid { __be32 keyid; }; /** * struct flow_dissector_key_ipv4_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv4_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ __be32 src; __be32 dst; }; /** * struct flow_dissector_key_ipv6_addrs: * @src: source ip address * @dst: destination ip address */ struct flow_dissector_key_ipv6_addrs { /* (src,dst) must be grouped, in the same way than in IP header */ struct in6_addr src; struct in6_addr dst; }; /** * struct flow_dissector_key_tipc: * @key: source node address combined with selector */ struct flow_dissector_key_tipc { __be32 key; }; /** * struct flow_dissector_key_addrs: * @v4addrs: IPv4 addresses * @v6addrs: IPv6 addresses */ struct flow_dissector_key_addrs { union { struct flow_dissector_key_ipv4_addrs v4addrs; struct flow_dissector_key_ipv6_addrs v6addrs; struct flow_dissector_key_tipc tipckey; }; }; /** * flow_dissector_key_arp: * @ports: Operation, source and target addresses for an ARP header * for Ethernet hardware addresses and IPv4 protocol addresses * sip: Sender IP address * tip: Target IP address * op: Operation * sha: Sender hardware address * tpa: Target hardware address */ struct flow_dissector_key_arp { __u32 sip; __u32 tip; __u8 op; unsigned char sha[ETH_ALEN]; unsigned char tha[ETH_ALEN]; }; /** * flow_dissector_key_tp_ports: * @ports: port numbers of Transport header * src: source port number * dst: destination port number */ struct flow_dissector_key_ports { union { __be32 ports; struct { __be16 src; __be16 dst; }; }; }; /** * flow_dissector_key_icmp: * type: ICMP type * code: ICMP code * id: session identifier */ struct flow_dissector_key_icmp { struct { u8 type; u8 code; }; u16 id; }; /** * struct flow_dissector_key_eth_addrs: * @src: source Ethernet address * @dst: destination Ethernet address */ struct flow_dissector_key_eth_addrs { /* (dst,src) must be grouped, in the same way than in ETH header */ unsigned char dst[ETH_ALEN]; unsigned char src[ETH_ALEN]; }; /** * struct flow_dissector_key_tcp: * @flags: flags */ struct flow_dissector_key_tcp { __be16 flags; }; /** * struct flow_dissector_key_ip: * @tos: tos * @ttl: ttl */ struct flow_dissector_key_ip { __u8 tos; __u8 ttl; }; /** * struct flow_dissector_key_meta: * @ingress_ifindex: ingress ifindex * @ingress_iftype: ingress interface type */ struct flow_dissector_key_meta { int ingress_ifindex; u16 ingress_iftype; }; /** * struct flow_dissector_key_ct: * @ct_state: conntrack state after converting with map * @ct_mark: conttrack mark * @ct_zone: conntrack zone * @ct_labels: conntrack labels */ struct flow_dissector_key_ct { u16 ct_state; u16 ct_zone; u32 ct_mark; u32 ct_labels[4]; }; /** * struct flow_dissector_key_hash: * @hash: hash value */ struct flow_dissector_key_hash { u32 hash; }; enum flow_dissector_key_id { FLOW_DISSECTOR_KEY_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_BASIC, /* struct flow_dissector_key_basic */ FLOW_DISSECTOR_KEY_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_PORTS_RANGE, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_ICMP, /* struct flow_dissector_key_icmp */ FLOW_DISSECTOR_KEY_ETH_ADDRS, /* struct flow_dissector_key_eth_addrs */ FLOW_DISSECTOR_KEY_TIPC, /* struct flow_dissector_key_tipc */ FLOW_DISSECTOR_KEY_ARP, /* struct flow_dissector_key_arp */ FLOW_DISSECTOR_KEY_VLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_FLOW_LABEL, /* struct flow_dissector_key_tags */ FLOW_DISSECTOR_KEY_GRE_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_MPLS_ENTROPY, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_KEYID, /* struct flow_dissector_key_keyid */ FLOW_DISSECTOR_KEY_ENC_IPV4_ADDRS, /* struct flow_dissector_key_ipv4_addrs */ FLOW_DISSECTOR_KEY_ENC_IPV6_ADDRS, /* struct flow_dissector_key_ipv6_addrs */ FLOW_DISSECTOR_KEY_ENC_CONTROL, /* struct flow_dissector_key_control */ FLOW_DISSECTOR_KEY_ENC_PORTS, /* struct flow_dissector_key_ports */ FLOW_DISSECTOR_KEY_MPLS, /* struct flow_dissector_key_mpls */ FLOW_DISSECTOR_KEY_TCP, /* struct flow_dissector_key_tcp */ FLOW_DISSECTOR_KEY_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_CVLAN, /* struct flow_dissector_key_vlan */ FLOW_DISSECTOR_KEY_ENC_IP, /* struct flow_dissector_key_ip */ FLOW_DISSECTOR_KEY_ENC_OPTS, /* struct flow_dissector_key_enc_opts */ FLOW_DISSECTOR_KEY_META, /* struct flow_dissector_key_meta */ FLOW_DISSECTOR_KEY_CT, /* struct flow_dissector_key_ct */ FLOW_DISSECTOR_KEY_HASH, /* struct flow_dissector_key_hash */ FLOW_DISSECTOR_KEY_MAX, }; #define FLOW_DISSECTOR_F_PARSE_1ST_FRAG BIT(0) #define FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL BIT(1) #define FLOW_DISSECTOR_F_STOP_AT_ENCAP BIT(2) struct flow_dissector_key { enum flow_dissector_key_id key_id; size_t offset; /* offset of struct flow_dissector_key_* in target the struct */ }; struct flow_dissector { unsigned int used_keys; /* each bit repesents presence of one key id */ unsigned short int offset[FLOW_DISSECTOR_KEY_MAX]; }; struct flow_keys_basic { struct flow_dissector_key_control control; struct flow_dissector_key_basic basic; }; struct flow_keys { struct flow_dissector_key_control control; #define FLOW_KEYS_HASH_START_FIELD basic struct flow_dissector_key_basic basic __aligned(SIPHASH_ALIGNMENT); struct flow_dissector_key_tags tags; struct flow_dissector_key_vlan vlan; struct flow_dissector_key_vlan cvlan; struct flow_dissector_key_keyid keyid; struct flow_dissector_key_ports ports; struct flow_dissector_key_icmp icmp; /* 'addrs' must be the last member */ struct flow_dissector_key_addrs addrs; }; #define FLOW_KEYS_HASH_OFFSET \ offsetof(struct flow_keys, FLOW_KEYS_HASH_START_FIELD) __be32 flow_get_u32_src(const struct flow_keys *flow); __be32 flow_get_u32_dst(const struct flow_keys *flow); extern struct flow_dissector flow_keys_dissector; extern struct flow_dissector flow_keys_basic_dissector; /* struct flow_keys_digest: * * This structure is used to hold a digest of the full flow keys. This is a * larger "hash" of a flow to allow definitively matching specific flows where * the 32 bit skb->hash is not large enough. The size is limited to 16 bytes so * that it can be used in CB of skb (see sch_choke for an example). */ #define FLOW_KEYS_DIGEST_LEN 16 struct flow_keys_digest { u8 data[FLOW_KEYS_DIGEST_LEN]; }; void make_flow_keys_digest(struct flow_keys_digest *digest, const struct flow_keys *flow); static inline bool flow_keys_have_l4(const struct flow_keys *keys) { return (keys->ports.ports || keys->tags.flow_label); } u32 flow_hash_from_keys(struct flow_keys *keys); void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, void *data, int thoff, int hlen); static inline bool dissector_uses_key(const struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) { return flow_dissector->used_keys & (1 << key_id); } static inline void *skb_flow_dissector_target(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id, void *target_container) { return ((char *)target_container) + flow_dissector->offset[key_id]; } struct bpf_flow_dissector { struct bpf_flow_keys *flow_keys; const struct sk_buff *skb; void *data; void *data_end; }; static inline void flow_dissector_init_keys(struct flow_dissector_key_control *key_control, struct flow_dissector_key_basic *key_basic) { memset(key_control, 0, sizeof(*key_control)); memset(key_basic, 0, sizeof(*key_basic)); } #ifdef CONFIG_BPF_SYSCALL int flow_dissector_bpf_prog_attach_check(struct net *net, struct bpf_prog *prog); #endif /* CONFIG_BPF_SYSCALL */ #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PTRACE_H #define _LINUX_PTRACE_H #include <linux/compiler.h> /* For unlikely. */ #include <linux/sched.h> /* For struct task_struct. */ #include <linux/sched/signal.h> /* For send_sig(), same_thread_group(), etc. */ #include <linux/err.h> /* for IS_ERR_VALUE */ #include <linux/bug.h> /* For BUG_ON. */ #include <linux/pid_namespace.h> /* For task_active_pid_ns. */ #include <uapi/linux/ptrace.h> #include <linux/seccomp.h> /* Add sp to seccomp_data, as seccomp is user API, we don't want to modify it */ struct syscall_info { __u64 sp; struct seccomp_data data; }; extern int ptrace_access_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, unsigned int gup_flags); /* * Ptrace flags * * The owner ship rules for task->ptrace which holds the ptrace * flags is simple. When a task is running it owns it's task->ptrace * flags. When the a task is stopped the ptracer owns task->ptrace. */ #define PT_SEIZED 0x00010000 /* SEIZE used, enable new behavior */ #define PT_PTRACED 0x00000001 #define PT_DTRACE 0x00000002 /* delayed trace (used on m68k, i386) */ #define PT_OPT_FLAG_SHIFT 3 /* PT_TRACE_* event enable flags */ #define PT_EVENT_FLAG(event) (1 << (PT_OPT_FLAG_SHIFT + (event))) #define PT_TRACESYSGOOD PT_EVENT_FLAG(0) #define PT_TRACE_FORK PT_EVENT_FLAG(PTRACE_EVENT_FORK) #define PT_TRACE_VFORK PT_EVENT_FLAG(PTRACE_EVENT_VFORK) #define PT_TRACE_CLONE PT_EVENT_FLAG(PTRACE_EVENT_CLONE) #define PT_TRACE_EXEC PT_EVENT_FLAG(PTRACE_EVENT_EXEC) #define PT_TRACE_VFORK_DONE PT_EVENT_FLAG(PTRACE_EVENT_VFORK_DONE) #define PT_TRACE_EXIT PT_EVENT_FLAG(PTRACE_EVENT_EXIT) #define PT_TRACE_SECCOMP PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP) #define PT_EXITKILL (PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT) #define PT_SUSPEND_SECCOMP (PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT) /* single stepping state bits (used on ARM and PA-RISC) */ #define PT_SINGLESTEP_BIT 31 #define PT_SINGLESTEP (1<<PT_SINGLESTEP_BIT) #define PT_BLOCKSTEP_BIT 30 #define PT_BLOCKSTEP (1<<PT_BLOCKSTEP_BIT) extern long arch_ptrace(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern void ptrace_disable(struct task_struct *); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent, const struct cred *ptracer_cred); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer, struct list_head *dead); #define PTRACE_MODE_READ 0x01 #define PTRACE_MODE_ATTACH 0x02 #define PTRACE_MODE_NOAUDIT 0x04 #define PTRACE_MODE_FSCREDS 0x08 #define PTRACE_MODE_REALCREDS 0x10 /* shorthands for READ/ATTACH and FSCREDS/REALCREDS combinations */ #define PTRACE_MODE_READ_FSCREDS (PTRACE_MODE_READ | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_READ_REALCREDS (PTRACE_MODE_READ | PTRACE_MODE_REALCREDS) #define PTRACE_MODE_ATTACH_FSCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_FSCREDS) #define PTRACE_MODE_ATTACH_REALCREDS (PTRACE_MODE_ATTACH | PTRACE_MODE_REALCREDS) /** * ptrace_may_access - check whether the caller is permitted to access * a target task. * @task: target task * @mode: selects type of access and caller credentials * * Returns true on success, false on denial. * * One of the flags PTRACE_MODE_FSCREDS and PTRACE_MODE_REALCREDS must * be set in @mode to specify whether the access was requested through * a filesystem syscall (should use effective capabilities and fsuid * of the caller) or through an explicit syscall such as * process_vm_writev or ptrace (and should use the real credentials). */ extern bool ptrace_may_access(struct task_struct *task, unsigned int mode); static inline int ptrace_reparented(struct task_struct *child) { return !same_thread_group(child->real_parent, child->parent); } static inline void ptrace_unlink(struct task_struct *child) { if (unlikely(child->ptrace)) __ptrace_unlink(child); } int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr, unsigned long data); int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr, unsigned long data); /** * ptrace_parent - return the task that is tracing the given task * @task: task to consider * * Returns %NULL if no one is tracing @task, or the &struct task_struct * pointer to its tracer. * * Must called under rcu_read_lock(). The pointer returned might be kept * live only by RCU. During exec, this may be called with task_lock() held * on @task, still held from when check_unsafe_exec() was called. */ static inline struct task_struct *ptrace_parent(struct task_struct *task) { if (unlikely(task->ptrace)) return rcu_dereference(task->parent); return NULL; } /** * ptrace_event_enabled - test whether a ptrace event is enabled * @task: ptracee of interest * @event: %PTRACE_EVENT_* to test * * Test whether @event is enabled for ptracee @task. * * Returns %true if @event is enabled, %false otherwise. */ static inline bool ptrace_event_enabled(struct task_struct *task, int event) { return task->ptrace & PT_EVENT_FLAG(event); } /** * ptrace_event - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @message: value for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @message * to the ptrace parent. * * Called without locks. */ static inline void ptrace_event(int event, unsigned long message) { if (unlikely(ptrace_event_enabled(current, event))) { current->ptrace_message = message; ptrace_notify((event << 8) | SIGTRAP); } else if (event == PTRACE_EVENT_EXEC) { /* legacy EXEC report via SIGTRAP */ if ((current->ptrace & (PT_PTRACED|PT_SEIZED)) == PT_PTRACED) send_sig(SIGTRAP, current, 0); } } /** * ptrace_event_pid - possibly stop for a ptrace event notification * @event: %PTRACE_EVENT_* value to report * @pid: process identifier for %PTRACE_GETEVENTMSG to return * * Check whether @event is enabled and, if so, report @event and @pid * to the ptrace parent. @pid is reported as the pid_t seen from the * the ptrace parent's pid namespace. * * Called without locks. */ static inline void ptrace_event_pid(int event, struct pid *pid) { /* * FIXME: There's a potential race if a ptracer in a different pid * namespace than parent attaches between computing message below and * when we acquire tasklist_lock in ptrace_stop(). If this happens, * the ptracer will get a bogus pid from PTRACE_GETEVENTMSG. */ unsigned long message = 0; struct pid_namespace *ns; rcu_read_lock(); ns = task_active_pid_ns(rcu_dereference(current->parent)); if (ns) message = pid_nr_ns(pid, ns); rcu_read_unlock(); ptrace_event(event, message); } /** * ptrace_init_task - initialize ptrace state for a new child * @child: new child task * @ptrace: true if child should be ptrace'd by parent's tracer * * This is called immediately after adding @child to its parent's children * list. @ptrace is false in the normal case, and true to ptrace @child. * * Called with current's siglock and write_lock_irq(&tasklist_lock) held. */ static inline void ptrace_init_task(struct task_struct *child, bool ptrace) { INIT_LIST_HEAD(&child->ptrace_entry); INIT_LIST_HEAD(&child->ptraced); child->jobctl = 0; child->ptrace = 0; child->parent = child->real_parent; if (unlikely(ptrace) && current->ptrace) { child->ptrace = current->ptrace; __ptrace_link(child, current->parent, current->ptracer_cred); if (child->ptrace & PT_SEIZED) task_set_jobctl_pending(child, JOBCTL_TRAP_STOP); else sigaddset(&child->pending.signal, SIGSTOP); } else child->ptracer_cred = NULL; } /** * ptrace_release_task - final ptrace-related cleanup of a zombie being reaped * @task: task in %EXIT_DEAD state * * Called with write_lock(&tasklist_lock) held. */ static inline void ptrace_release_task(struct task_struct *task) { BUG_ON(!list_empty(&task->ptraced)); ptrace_unlink(task); BUG_ON(!list_empty(&task->ptrace_entry)); } #ifndef force_successful_syscall_return /* * System call handlers that, upon successful completion, need to return a * negative value should call force_successful_syscall_return() right before * returning. On architectures where the syscall convention provides for a * separate error flag (e.g., alpha, ia64, ppc{,64}, sparc{,64}, possibly * others), this macro can be used to ensure that the error flag will not get * set. On architectures which do not support a separate error flag, the macro * is a no-op and the spurious error condition needs to be filtered out by some * other means (e.g., in user-level, by passing an extra argument to the * syscall handler, or something along those lines). */ #define force_successful_syscall_return() do { } while (0) #endif #ifndef is_syscall_success /* * On most systems we can tell if a syscall is a success based on if the retval * is an error value. On some systems like ia64 and powerpc they have different * indicators of success/failure and must define their own. */ #define is_syscall_success(regs) (!IS_ERR_VALUE((unsigned long)(regs_return_value(regs)))) #endif /* * <asm/ptrace.h> should define the following things inside #ifdef __KERNEL__. * * These do-nothing inlines are used when the arch does not * implement single-step. The kerneldoc comments are here * to document the interface for all arch definitions. */ #ifndef arch_has_single_step /** * arch_has_single_step - does this CPU support user-mode single-step? * * If this is defined, then there must be function declarations or * inlines for user_enable_single_step() and user_disable_single_step(). * arch_has_single_step() should evaluate to nonzero iff the machine * supports instruction single-step for user mode. * It can be a constant or it can test a CPU feature bit. */ #define arch_has_single_step() (0) /** * user_enable_single_step - single-step in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_single_step() has returned nonzero. * Set @task so that when it returns to user mode, it will trap after the * next single instruction executes. If arch_has_block_step() is defined, * this must clear the effects of user_enable_block_step() too. */ static inline void user_enable_single_step(struct task_struct *task) { BUG(); /* This can never be called. */ } /** * user_disable_single_step - cancel user-mode single-step * @task: either current or a task stopped in %TASK_TRACED * * Clear @task of the effects of user_enable_single_step() and * user_enable_block_step(). This can be called whether or not either * of those was ever called on @task, and even if arch_has_single_step() * returned zero. */ static inline void user_disable_single_step(struct task_struct *task) { } #else extern void user_enable_single_step(struct task_struct *); extern void user_disable_single_step(struct task_struct *); #endif /* arch_has_single_step */ #ifndef arch_has_block_step /** * arch_has_block_step - does this CPU support user-mode block-step? * * If this is defined, then there must be a function declaration or inline * for user_enable_block_step(), and arch_has_single_step() must be defined * too. arch_has_block_step() should evaluate to nonzero iff the machine * supports step-until-branch for user mode. It can be a constant or it * can test a CPU feature bit. */ #define arch_has_block_step() (0) /** * user_enable_block_step - step until branch in user-mode task * @task: either current or a task stopped in %TASK_TRACED * * This can only be called when arch_has_block_step() has returned nonzero, * and will never be called when single-instruction stepping is being used. * Set @task so that when it returns to user mode, it will trap after the * next branch or trap taken. */ static inline void user_enable_block_step(struct task_struct *task) { BUG(); /* This can never be called. */ } #else extern void user_enable_block_step(struct task_struct *); #endif /* arch_has_block_step */ #ifdef ARCH_HAS_USER_SINGLE_STEP_REPORT extern void user_single_step_report(struct pt_regs *regs); #else static inline void user_single_step_report(struct pt_regs *regs) { kernel_siginfo_t info; clear_siginfo(&info); info.si_signo = SIGTRAP; info.si_errno = 0; info.si_code = SI_USER; info.si_pid = 0; info.si_uid = 0; force_sig_info(&info); } #endif #ifndef arch_ptrace_stop_needed /** * arch_ptrace_stop_needed - Decide whether arch_ptrace_stop() should be called * @code: current->exit_code value ptrace will stop with * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with the siglock held, to decide whether or not it's * necessary to release the siglock and call arch_ptrace_stop() with the * same @code and @info arguments. It can be defined to a constant if * arch_ptrace_stop() is never required, or always is. On machines where * this makes sense, it should be defined to a quick test to optimize out * calling arch_ptrace_stop() when it would be superfluous. For example, * if the thread has not been back to user mode since the last stop, the * thread state might indicate that nothing needs to be done. * * This is guaranteed to be invoked once before a task stops for ptrace and * may include arch-specific operations necessary prior to a ptrace stop. */ #define arch_ptrace_stop_needed(code, info) (0) #endif #ifndef arch_ptrace_stop /** * arch_ptrace_stop - Do machine-specific work before stopping for ptrace * @code: current->exit_code value ptrace will stop with * @info: siginfo_t pointer (or %NULL) for signal ptrace will stop with * * This is called with no locks held when arch_ptrace_stop_needed() has * just returned nonzero. It is allowed to block, e.g. for user memory * access. The arch can have machine-specific work to be done before * ptrace stops. On ia64, register backing store gets written back to user * memory here. Since this can be costly (requires dropping the siglock), * we only do it when the arch requires it for this particular stop, as * indicated by arch_ptrace_stop_needed(). */ #define arch_ptrace_stop(code, info) do { } while (0) #endif #ifndef current_pt_regs #define current_pt_regs() task_pt_regs(current) #endif /* * unlike current_pt_regs(), this one is equal to task_pt_regs(current) * on *all* architectures; the only reason to have a per-arch definition * is optimisation. */ #ifndef signal_pt_regs #define signal_pt_regs() task_pt_regs(current) #endif #ifndef current_user_stack_pointer #define current_user_stack_pointer() user_stack_pointer(current_pt_regs()) #endif extern int task_current_syscall(struct task_struct *target, struct syscall_info *info); extern void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact); #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_COMMON_H #define _INET_COMMON_H #include <linux/indirect_call_wrapper.h> extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; /* * INET4 prototypes used by INET6 */ struct msghdr; struct sock; struct sockaddr; struct socket; int inet_release(struct socket *sock); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features); static inline void inet_ctl_sock_destroy(struct sock *sk) { if (sk) sock_release(sk->sk_socket); } #define indirect_call_gro_receive(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_2(cb, f2, f1, head, skb); \ }) #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 /* SPDX-License-Identifier: GPL-2.0 */ /* * Macros for manipulating and testing page->flags */ #ifndef PAGE_FLAGS_H #define PAGE_FLAGS_H #include <linux/types.h> #include <linux/bug.h> #include <linux/mmdebug.h> #ifndef __GENERATING_BOUNDS_H #include <linux/mm_types.h> #include <generated/bounds.h> #endif /* !__GENERATING_BOUNDS_H */ /* * Various page->flags bits: * * PG_reserved is set for special pages. The "struct page" of such a page * should in general not be touched (e.g. set dirty) except by its owner. * Pages marked as PG_reserved include: * - Pages part of the kernel image (including vDSO) and similar (e.g. BIOS, * initrd, HW tables) * - Pages reserved or allocated early during boot (before the page allocator * was initialized). This includes (depending on the architecture) the * initial vmemmap, initial page tables, crashkernel, elfcorehdr, and much * much more. Once (if ever) freed, PG_reserved is cleared and they will * be given to the page allocator. * - Pages falling into physical memory gaps - not IORESOURCE_SYSRAM. Trying * to read/write these pages might end badly. Don't touch! * - The zero page(s) * - Pages not added to the page allocator when onlining a section because * they were excluded via the online_page_callback() or because they are * PG_hwpoison. * - Pages allocated in the context of kexec/kdump (loaded kernel image, * control pages, vmcoreinfo) * - MMIO/DMA pages. Some architectures don't allow to ioremap pages that are * not marked PG_reserved (as they might be in use by somebody else who does * not respect the caching strategy). * - Pages part of an offline section (struct pages of offline sections should * not be trusted as they will be initialized when first onlined). * - MCA pages on ia64 * - Pages holding CPU notes for POWER Firmware Assisted Dump * - Device memory (e.g. PMEM, DAX, HMM) * Some PG_reserved pages will be excluded from the hibernation image. * PG_reserved does in general not hinder anybody from dumping or swapping * and is no longer required for remap_pfn_range(). ioremap might require it. * Consequently, PG_reserved for a page mapped into user space can indicate * the zero page, the vDSO, MMIO pages or device memory. * * The PG_private bitflag is set on pagecache pages if they contain filesystem * specific data (which is normally at page->private). It can be used by * private allocations for its own usage. * * During initiation of disk I/O, PG_locked is set. This bit is set before I/O * and cleared when writeback _starts_ or when read _completes_. PG_writeback * is set before writeback starts and cleared when it finishes. * * PG_locked also pins a page in pagecache, and blocks truncation of the file * while it is held. * * page_waitqueue(page) is a wait queue of all tasks waiting for the page * to become unlocked. * * PG_swapbacked is set when a page uses swap as a backing storage. This are * usually PageAnon or shmem pages but please note that even anonymous pages * might lose their PG_swapbacked flag when they simply can be dropped (e.g. as * a result of MADV_FREE). * * PG_uptodate tells whether the page's contents is valid. When a read * completes, the page becomes uptodate, unless a disk I/O error happened. * * PG_referenced, PG_reclaim are used for page reclaim for anonymous and * file-backed pagecache (see mm/vmscan.c). * * PG_error is set to indicate that an I/O error occurred on this page. * * PG_arch_1 is an architecture specific page state bit. The generic code * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch! */ /* * Don't use the *_dontuse flags. Use the macros. Otherwise you'll break * locked- and dirty-page accounting. * * The page flags field is split into two parts, the main flags area * which extends from the low bits upwards, and the fields area which * extends from the high bits downwards. * * | FIELD | ... | FLAGS | * N-1 ^ 0 * (NR_PAGEFLAGS) * * The fields area is reserved for fields mapping zone, node (for NUMA) and * SPARSEMEM section (for variants of SPARSEMEM that require section ids like * SPARSEMEM_EXTREME with !SPARSEMEM_VMEMMAP). */ enum pageflags { PG_locked, /* Page is locked. Don't touch. */ PG_referenced, PG_uptodate, PG_dirty, PG_lru, PG_active, PG_workingset, PG_waiters, /* Page has waiters, check its waitqueue. Must be bit #7 and in the same byte as "PG_locked" */ PG_error, PG_slab, PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/ PG_arch_1, PG_reserved, PG_private, /* If pagecache, has fs-private data */ PG_private_2, /* If pagecache, has fs aux data */ PG_writeback, /* Page is under writeback */ PG_head, /* A head page */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU PG_mlocked, /* Page is vma mlocked */ #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PG_uncached, /* Page has been mapped as uncached */ #endif #ifdef CONFIG_MEMORY_FAILURE PG_hwpoison, /* hardware poisoned page. Don't touch */ #endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) PG_young, PG_idle, #endif #ifdef CONFIG_64BIT PG_arch_2, #endif __NR_PAGEFLAGS, /* Filesystems */ PG_checked = PG_owner_priv_1, /* SwapBacked */ PG_swapcache = PG_owner_priv_1, /* Swap page: swp_entry_t in private */ /* Two page bits are conscripted by FS-Cache to maintain local caching * state. These bits are set on pages belonging to the netfs's inodes * when those inodes are being locally cached. */ PG_fscache = PG_private_2, /* page backed by cache */ /* XEN */ /* Pinned in Xen as a read-only pagetable page. */ PG_pinned = PG_owner_priv_1, /* Pinned as part of domain save (see xen_mm_pin_all()). */ PG_savepinned = PG_dirty, /* Has a grant mapping of another (foreign) domain's page. */ PG_foreign = PG_owner_priv_1, /* Remapped by swiotlb-xen. */ PG_xen_remapped = PG_owner_priv_1, /* SLOB */ PG_slob_free = PG_private, /* Compound pages. Stored in first tail page's flags */ PG_double_map = PG_workingset, /* non-lru isolated movable page */ PG_isolated = PG_reclaim, /* Only valid for buddy pages. Used to track pages that are reported */ PG_reported = PG_uptodate, }; #ifndef __GENERATING_BOUNDS_H struct page; /* forward declaration */ static inline struct page *compound_head(struct page *page) { unsigned long head = READ_ONCE(page->compound_head); if (unlikely(head & 1)) return (struct page *) (head - 1); return page; } static __always_inline int PageTail(struct page *page) { return READ_ONCE(page->compound_head) & 1; } static __always_inline int PageCompound(struct page *page) { return test_bit(PG_head, &page->flags) || PageTail(page); } #define PAGE_POISON_PATTERN -1l static inline int PagePoisoned(const struct page *page) { return page->flags == PAGE_POISON_PATTERN; } #ifdef CONFIG_DEBUG_VM void page_init_poison(struct page *page, size_t size); #else static inline void page_init_poison(struct page *page, size_t size) { } #endif /* * Page flags policies wrt compound pages * * PF_POISONED_CHECK * check if this struct page poisoned/uninitialized * * PF_ANY: * the page flag is relevant for small, head and tail pages. * * PF_HEAD: * for compound page all operations related to the page flag applied to * head page. * * PF_ONLY_HEAD: * for compound page, callers only ever operate on the head page. * * PF_NO_TAIL: * modifications of the page flag must be done on small or head pages, * checks can be done on tail pages too. * * PF_NO_COMPOUND: * the page flag is not relevant for compound pages. * * PF_SECOND: * the page flag is stored in the first tail page. */ #define PF_POISONED_CHECK(page) ({ \ VM_BUG_ON_PGFLAGS(PagePoisoned(page), page); \ page; }) #define PF_ANY(page, enforce) PF_POISONED_CHECK(page) #define PF_HEAD(page, enforce) PF_POISONED_CHECK(compound_head(page)) #define PF_ONLY_HEAD(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(PageTail(page), page); \ PF_POISONED_CHECK(page); }) #define PF_NO_TAIL(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageTail(page), page); \ PF_POISONED_CHECK(compound_head(page)); }) #define PF_NO_COMPOUND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(enforce && PageCompound(page), page); \ PF_POISONED_CHECK(page); }) #define PF_SECOND(page, enforce) ({ \ VM_BUG_ON_PGFLAGS(!PageHead(page), page); \ PF_POISONED_CHECK(&page[1]); }) /* * Macros to create function definitions for page flags */ #define TESTPAGEFLAG(uname, lname, policy) \ static __always_inline int Page##uname(struct page *page) \ { return test_bit(PG_##lname, &policy(page, 0)->flags); } #define SETPAGEFLAG(uname, lname, policy) \ static __always_inline void SetPage##uname(struct page *page) \ { set_bit(PG_##lname, &policy(page, 1)->flags); } #define CLEARPAGEFLAG(uname, lname, policy) \ static __always_inline void ClearPage##uname(struct page *page) \ { clear_bit(PG_##lname, &policy(page, 1)->flags); } #define __SETPAGEFLAG(uname, lname, policy) \ static __always_inline void __SetPage##uname(struct page *page) \ { __set_bit(PG_##lname, &policy(page, 1)->flags); } #define __CLEARPAGEFLAG(uname, lname, policy) \ static __always_inline void __ClearPage##uname(struct page *page) \ { __clear_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTSETFLAG(uname, lname, policy) \ static __always_inline int TestSetPage##uname(struct page *page) \ { return test_and_set_bit(PG_##lname, &policy(page, 1)->flags); } #define TESTCLEARFLAG(uname, lname, policy) \ static __always_inline int TestClearPage##uname(struct page *page) \ { return test_and_clear_bit(PG_##lname, &policy(page, 1)->flags); } #define PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ SETPAGEFLAG(uname, lname, policy) \ CLEARPAGEFLAG(uname, lname, policy) #define __PAGEFLAG(uname, lname, policy) \ TESTPAGEFLAG(uname, lname, policy) \ __SETPAGEFLAG(uname, lname, policy) \ __CLEARPAGEFLAG(uname, lname, policy) #define TESTSCFLAG(uname, lname, policy) \ TESTSETFLAG(uname, lname, policy) \ TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname) \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname) \ static inline void SetPage##uname(struct page *page) { } #define CLEARPAGEFLAG_NOOP(uname) \ static inline void ClearPage##uname(struct page *page) { } #define __CLEARPAGEFLAG_NOOP(uname) \ static inline void __ClearPage##uname(struct page *page) { } #define TESTSETFLAG_FALSE(uname) \ static inline int TestSetPage##uname(struct page *page) { return 0; } #define TESTCLEARFLAG_FALSE(uname) \ static inline int TestClearPage##uname(struct page *page) { return 0; } #define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \ SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname) #define TESTSCFLAG_FALSE(uname) \ TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname) __PAGEFLAG(Locked, locked, PF_NO_TAIL) PAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) __CLEARPAGEFLAG(Waiters, waiters, PF_ONLY_HEAD) PAGEFLAG(Error, error, PF_NO_TAIL) TESTCLEARFLAG(Error, error, PF_NO_TAIL) PAGEFLAG(Referenced, referenced, PF_HEAD) TESTCLEARFLAG(Referenced, referenced, PF_HEAD) __SETPAGEFLAG(Referenced, referenced, PF_HEAD) PAGEFLAG(Dirty, dirty, PF_HEAD) TESTSCFLAG(Dirty, dirty, PF_HEAD) __CLEARPAGEFLAG(Dirty, dirty, PF_HEAD) PAGEFLAG(LRU, lru, PF_HEAD) __CLEARPAGEFLAG(LRU, lru, PF_HEAD) PAGEFLAG(Active, active, PF_HEAD) __CLEARPAGEFLAG(Active, active, PF_HEAD) TESTCLEARFLAG(Active, active, PF_HEAD) PAGEFLAG(Workingset, workingset, PF_HEAD) TESTCLEARFLAG(Workingset, workingset, PF_HEAD) __PAGEFLAG(Slab, slab, PF_NO_TAIL) __PAGEFLAG(SlobFree, slob_free, PF_NO_TAIL) PAGEFLAG(Checked, checked, PF_NO_COMPOUND) /* Used by some filesystems */ /* Xen */ PAGEFLAG(Pinned, pinned, PF_NO_COMPOUND) TESTSCFLAG(Pinned, pinned, PF_NO_COMPOUND) PAGEFLAG(SavePinned, savepinned, PF_NO_COMPOUND); PAGEFLAG(Foreign, foreign, PF_NO_COMPOUND); PAGEFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) TESTCLEARFLAG(XenRemapped, xen_remapped, PF_NO_COMPOUND) PAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __CLEARPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) __SETPAGEFLAG(Reserved, reserved, PF_NO_COMPOUND) PAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __CLEARPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) __SETPAGEFLAG(SwapBacked, swapbacked, PF_NO_TAIL) /* * Private page markings that may be used by the filesystem that owns the page * for its own purposes. * - PG_private and PG_private_2 cause releasepage() and co to be invoked */ PAGEFLAG(Private, private, PF_ANY) __SETPAGEFLAG(Private, private, PF_ANY) __CLEARPAGEFLAG(Private, private, PF_ANY) PAGEFLAG(Private2, private_2, PF_ANY) TESTSCFLAG(Private2, private_2, PF_ANY) PAGEFLAG(OwnerPriv1, owner_priv_1, PF_ANY) TESTCLEARFLAG(OwnerPriv1, owner_priv_1, PF_ANY) /* * Only test-and-set exist for PG_writeback. The unconditional operators are * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback, PF_NO_TAIL) TESTSCFLAG(Writeback, writeback, PF_NO_TAIL) PAGEFLAG(MappedToDisk, mappedtodisk, PF_NO_TAIL) /* PG_readahead is only used for reads; PG_reclaim is only for writes */ PAGEFLAG(Reclaim, reclaim, PF_NO_TAIL) TESTCLEARFLAG(Reclaim, reclaim, PF_NO_TAIL) PAGEFLAG(Readahead, reclaim, PF_NO_COMPOUND) TESTCLEARFLAG(Readahead, reclaim, PF_NO_COMPOUND) #ifdef CONFIG_HIGHMEM /* * Must use a macro here due to header dependency issues. page_zone() is not * available at this point. */ #define PageHighMem(__p) is_highmem_idx(page_zonenum(__p)) #else PAGEFLAG_FALSE(HighMem) #endif #ifdef CONFIG_SWAP static __always_inline int PageSwapCache(struct page *page) { #ifdef CONFIG_THP_SWAP page = compound_head(page); #endif return PageSwapBacked(page) && test_bit(PG_swapcache, &page->flags); } SETPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) CLEARPAGEFLAG(SwapCache, swapcache, PF_NO_TAIL) #else PAGEFLAG_FALSE(SwapCache) #endif PAGEFLAG(Unevictable, unevictable, PF_HEAD) __CLEARPAGEFLAG(Unevictable, unevictable, PF_HEAD) TESTCLEARFLAG(Unevictable, unevictable, PF_HEAD) #ifdef CONFIG_MMU PAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) __CLEARPAGEFLAG(Mlocked, mlocked, PF_NO_TAIL) TESTSCFLAG(Mlocked, mlocked, PF_NO_TAIL) #else PAGEFLAG_FALSE(Mlocked) __CLEARPAGEFLAG_NOOP(Mlocked) TESTSCFLAG_FALSE(Mlocked) #endif #ifdef CONFIG_ARCH_USES_PG_UNCACHED PAGEFLAG(Uncached, uncached, PF_NO_COMPOUND) #else PAGEFLAG_FALSE(Uncached) #endif #ifdef CONFIG_MEMORY_FAILURE PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) extern bool take_page_off_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison) #define __PG_HWPOISON 0 #endif #if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT) TESTPAGEFLAG(Young, young, PF_ANY) SETPAGEFLAG(Young, young, PF_ANY) TESTCLEARFLAG(Young, young, PF_ANY) PAGEFLAG(Idle, idle, PF_ANY) #endif /* * PageReported() is used to track reported free pages within the Buddy * allocator. We can use the non-atomic version of the test and set * operations as both should be shielded with the zone lock to prevent * any possible races on the setting or clearing of the bit. */ __PAGEFLAG(Reported, reported, PF_NO_COMPOUND) /* * On an anonymous page mapped into a user virtual memory area, * page->mapping points to its anon_vma, not to a struct address_space; * with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h. * * On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled, * the PAGE_MAPPING_MOVABLE bit may be set along with the PAGE_MAPPING_ANON * bit; and then page->mapping points, not to an anon_vma, but to a private * structure which KSM associates with that merged page. See ksm.h. * * PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is used for non-lru movable * page and then page->mapping points a struct address_space. * * Please note that, confusingly, "page_mapping" refers to the inode * address_space which maps the page from disk; whereas "page_mapped" * refers to user virtual address space into which the page is mapped. */ #define PAGE_MAPPING_ANON 0x1 #define PAGE_MAPPING_MOVABLE 0x2 #define PAGE_MAPPING_KSM (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) #define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_MOVABLE) static __always_inline int PageMappingFlags(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) != 0; } static __always_inline int PageAnon(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; } static __always_inline int __PageMovable(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_MOVABLE; } #ifdef CONFIG_KSM /* * A KSM page is one of those write-protected "shared pages" or "merged pages" * which KSM maps into multiple mms, wherever identical anonymous page content * is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any * anon_vma, but to that page's node of the stable tree. */ static __always_inline int PageKsm(struct page *page) { page = compound_head(page); return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } #else TESTPAGEFLAG_FALSE(Ksm) #endif u64 stable_page_flags(struct page *page); static inline int PageUptodate(struct page *page) { int ret; page = compound_head(page); ret = test_bit(PG_uptodate, &(page)->flags); /* * Must ensure that the data we read out of the page is loaded * _after_ we've loaded page->flags to check for PageUptodate. * We can skip the barrier if the page is not uptodate, because * we wouldn't be reading anything from it. * * See SetPageUptodate() for the other side of the story. */ if (ret) smp_rmb(); return ret; } static __always_inline void __SetPageUptodate(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); smp_wmb(); __set_bit(PG_uptodate, &page->flags); } static __always_inline void SetPageUptodate(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); /* * Memory barrier must be issued before setting the PG_uptodate bit, * so that all previous stores issued in order to bring the page * uptodate are actually visible before PageUptodate becomes true. */ smp_wmb(); set_bit(PG_uptodate, &page->flags); } CLEARPAGEFLAG(Uptodate, uptodate, PF_NO_TAIL) int test_clear_page_writeback(struct page *page); int __test_set_page_writeback(struct page *page, bool keep_write); #define test_set_page_writeback(page) \ __test_set_page_writeback(page, false) #define test_set_page_writeback_keepwrite(page) \ __test_set_page_writeback(page, true) static inline void set_page_writeback(struct page *page) { test_set_page_writeback(page); } static inline void set_page_writeback_keepwrite(struct page *page) { test_set_page_writeback_keepwrite(page); } __PAGEFLAG(Head, head, PF_ANY) CLEARPAGEFLAG(Head, head, PF_ANY) static __always_inline void set_compound_head(struct page *page, struct page *head) { WRITE_ONCE(page->compound_head, (unsigned long)head + 1); } static __always_inline void clear_compound_head(struct page *page) { WRITE_ONCE(page->compound_head, 0); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE static inline void ClearPageCompound(struct page *page) { BUG_ON(!PageHead(page)); ClearPageHead(page); } #endif #define PG_head_mask ((1UL << PG_head)) #ifdef CONFIG_HUGETLB_PAGE int PageHuge(struct page *page); int PageHeadHuge(struct page *page); bool page_huge_active(struct page *page); #else TESTPAGEFLAG_FALSE(Huge) TESTPAGEFLAG_FALSE(HeadHuge) static inline bool page_huge_active(struct page *page) { return 0; } #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* * PageHuge() only returns true for hugetlbfs pages, but not for * normal or transparent huge pages. * * PageTransHuge() returns true for both transparent huge and * hugetlbfs pages, but not normal pages. PageTransHuge() can only be * called only in the core VM paths where hugetlbfs pages can't exist. */ static inline int PageTransHuge(struct page *page) { VM_BUG_ON_PAGE(PageTail(page), page); return PageHead(page); } /* * PageTransCompound returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransCompound(struct page *page) { return PageCompound(page); } /* * PageTransCompoundMap is the same as PageTransCompound, but it also * guarantees the primary MMU has the entire compound page mapped * through pmd_trans_huge, which in turn guarantees the secondary MMUs * can also map the entire compound page. This allows the secondary * MMUs to call get_user_pages() only once for each compound page and * to immediately map the entire compound page with a single secondary * MMU fault. If there will be a pmd split later, the secondary MMUs * will get an update through the MMU notifier invalidation through * split_huge_pmd(). * * Unlike PageTransCompound, this is safe to be called only while * split_huge_pmd() cannot run from under us, like if protected by the * MMU notifier, otherwise it may result in page->_mapcount check false * positives. * * We have to treat page cache THP differently since every subpage of it * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE * mapped in the current process so comparing subpage's _mapcount to * compound_mapcount to filter out PTE mapped case. */ static inline int PageTransCompoundMap(struct page *page) { struct page *head; if (!PageTransCompound(page)) return 0; if (PageAnon(page)) return atomic_read(&page->_mapcount) < 0; head = compound_head(page); /* File THP is PMD mapped and not PTE mapped */ return atomic_read(&page->_mapcount) == atomic_read(compound_mapcount_ptr(head)); } /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known * that hugetlbfs pages aren't involved. */ static inline int PageTransTail(struct page *page) { return PageTail(page); } /* * PageDoubleMap indicates that the compound page is mapped with PTEs as well * as PMDs. * * This is required for optimization of rmap operations for THP: we can postpone * per small page mapcount accounting (and its overhead from atomic operations) * until the first PMD split. * * For the page PageDoubleMap means ->_mapcount in all sub-pages is offset up * by one. This reference will go away with last compound_mapcount. * * See also __split_huge_pmd_locked() and page_remove_anon_compound_rmap(). */ PAGEFLAG(DoubleMap, double_map, PF_SECOND) TESTSCFLAG(DoubleMap, double_map, PF_SECOND) #else TESTPAGEFLAG_FALSE(TransHuge) TESTPAGEFLAG_FALSE(TransCompound) TESTPAGEFLAG_FALSE(TransCompoundMap) TESTPAGEFLAG_FALSE(TransTail) PAGEFLAG_FALSE(DoubleMap) TESTSCFLAG_FALSE(DoubleMap) #endif /* * For pages that are never mapped to userspace (and aren't PageSlab), * page_type may be used. Because it is initialised to -1, we invert the * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and * low bits so that an underflow or overflow of page_mapcount() won't be * mistaken for a page type value. */ #define PAGE_TYPE_BASE 0xf0000000 /* Reserve 0x0000007f to catch underflows of page_mapcount */ #define PAGE_MAPCOUNT_RESERVE -128 #define PG_buddy 0x00000080 #define PG_offline 0x00000100 #define PG_kmemcg 0x00000200 #define PG_table 0x00000400 #define PG_guard 0x00000800 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) static inline int page_has_type(struct page *page) { return (int)page->page_type < PAGE_MAPCOUNT_RESERVE; } #define PAGE_TYPE_OPS(uname, lname) \ static __always_inline int Page##uname(struct page *page) \ { \ return PageType(page, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!PageType(page, 0), page); \ page->page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ page->page_type |= PG_##lname; \ } /* * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ PAGE_TYPE_OPS(Buddy, buddy) /* * PageOffline() indicates that the page is logically offline although the * containing section is online. (e.g. inflated in a balloon driver or * not onlined when onlining the section). * The content of these pages is effectively stale. Such pages should not * be touched (read/write/dump/save) except by their owner. * * If a driver wants to allow to offline unmovable PageOffline() pages without * putting them back to the buddy, it can do so via the memory notifier by * decrementing the reference count in MEM_GOING_OFFLINE and incrementing the * reference count in MEM_CANCEL_OFFLINE. When offlining, the PageOffline() * pages (now with a reference count of zero) are treated like free pages, * allowing the containing memory block to get offlined. A driver that * relies on this feature is aware that re-onlining the memory block will * require to re-set the pages PageOffline() and not giving them to the * buddy via online_page_callback_t. */ PAGE_TYPE_OPS(Offline, offline) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. */ PAGE_TYPE_OPS(Kmemcg, kmemcg) /* * Marks pages in use as page tables. */ PAGE_TYPE_OPS(Table, table) /* * Marks guardpages used with debug_pagealloc. */ PAGE_TYPE_OPS(Guard, guard) extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); /* * If network-based swap is enabled, sl*b must keep track of whether pages * were allocated from pfmemalloc reserves. */ static inline int PageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); return PageActive(page); } static inline void SetPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); SetPageActive(page); } static inline void __ClearPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); __ClearPageActive(page); } static inline void ClearPageSlabPfmemalloc(struct page *page) { VM_BUG_ON_PAGE(!PageSlab(page), page); ClearPageActive(page); } #ifdef CONFIG_MMU #define __PG_MLOCKED (1UL << PG_mlocked) #else #define __PG_MLOCKED 0 #endif /* * Flags checked when a page is freed. Pages being freed should not have * these flags set. It they are, there is a problem. */ #define PAGE_FLAGS_CHECK_AT_FREE \ (1UL << PG_lru | 1UL << PG_locked | \ 1UL << PG_private | 1UL << PG_private_2 | \ 1UL << PG_writeback | 1UL << PG_reserved | \ 1UL << PG_slab | 1UL << PG_active | \ 1UL << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. * Pages being prepped should not have these flags set. It they are set, * there has been a kernel bug or struct page corruption. * * __PG_HWPOISON is exceptional because it needs to be kept beyond page's * alloc-free cycle to prevent from reusing the page. */ #define PAGE_FLAGS_CHECK_AT_PREP \ (((1UL << NR_PAGEFLAGS) - 1) & ~__PG_HWPOISON) #define PAGE_FLAGS_PRIVATE \ (1UL << PG_private | 1UL << PG_private_2) /** * page_has_private - Determine if page has private stuff * @page: The page to be checked * * Determine if a page has private stuff, indicating that release routines * should be invoked upon it. */ static inline int page_has_private(struct page *page) { return !!(page->flags & PAGE_FLAGS_PRIVATE); } #undef PF_ANY #undef PF_HEAD #undef PF_ONLY_HEAD #undef PF_NO_TAIL #undef PF_NO_COMPOUND #undef PF_SECOND #endif /* !__GENERATING_BOUNDS_H */ #endif /* PAGE_FLAGS_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 // SPDX-License-Identifier: GPL-2.0 /* * linux/fs/ext4/truncate.h * * Common inline functions needed for truncate support */ /* * Truncate blocks that were not used by write. We have to truncate the * pagecache as well so that corresponding buffers get properly unmapped. */ static inline void ext4_truncate_failed_write(struct inode *inode) { /* * We don't need to call ext4_break_layouts() because the blocks we * are truncating were never visible to userspace. */ down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); up_write(&EXT4_I(inode)->i_mmap_sem); } /* * Work out how many blocks we need to proceed with the next chunk of a * truncate transaction. */ static inline unsigned long ext4_blocks_for_truncate(struct inode *inode) { ext4_lblk_t needed; needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9); /* Give ourselves just enough room to cope with inodes in which * i_blocks is corrupt: we've seen disk corruptions in the past * which resulted in random data in an inode which looked enough * like a regular file for ext4 to try to delete it. Things * will go a bit crazy if that happens, but at least we should * try not to panic the whole kernel. */ if (needed < 2) needed = 2; /* But we need to bound the transaction so we don't overflow the * journal. */ if (needed > EXT4_MAX_TRANS_DATA) needed = EXT4_MAX_TRANS_DATA; return EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + needed; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM exceptions #if !defined(_TRACE_PAGE_FAULT_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_PAGE_FAULT_H #include <linux/tracepoint.h> #include <asm/trace/common.h> extern int trace_pagefault_reg(void); extern void trace_pagefault_unreg(void); DECLARE_EVENT_CLASS(x86_exceptions, TP_PROTO(unsigned long address, struct pt_regs *regs, unsigned long error_code), TP_ARGS(address, regs, error_code), TP_STRUCT__entry( __field( unsigned long, address ) __field( unsigned long, ip ) __field( unsigned long, error_code ) ), TP_fast_assign( __entry->address = address; __entry->ip = regs->ip; __entry->error_code = error_code; ), TP_printk("address=%ps ip=%ps error_code=0x%lx", (void *)__entry->address, (void *)__entry->ip, __entry->error_code) ); #define DEFINE_PAGE_FAULT_EVENT(name) \ DEFINE_EVENT_FN(x86_exceptions, name, \ TP_PROTO(unsigned long address, struct pt_regs *regs, \ unsigned long error_code), \ TP_ARGS(address, regs, error_code), \ trace_pagefault_reg, trace_pagefault_unreg); DEFINE_PAGE_FAULT_EVENT(page_fault_user); DEFINE_PAGE_FAULT_EVENT(page_fault_kernel); #undef TRACE_INCLUDE_PATH #undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH . #define TRACE_INCLUDE_FILE exceptions #endif /* _TRACE_PAGE_FAULT_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM io_uring #if !defined(_TRACE_IO_URING_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IO_URING_H #include <linux/tracepoint.h> struct io_wq_work; /** * io_uring_create - called after a new io_uring context was prepared * * @fd: corresponding file descriptor * @ctx: pointer to a ring context structure * @sq_entries: actual SQ size * @cq_entries: actual CQ size * @flags: SQ ring flags, provided to io_uring_setup(2) * * Allows to trace io_uring creation and provide pointer to a context, that can * be used later to find correlated events. */ TRACE_EVENT(io_uring_create, TP_PROTO(int fd, void *ctx, u32 sq_entries, u32 cq_entries, u32 flags), TP_ARGS(fd, ctx, sq_entries, cq_entries, flags), TP_STRUCT__entry ( __field( int, fd ) __field( void *, ctx ) __field( u32, sq_entries ) __field( u32, cq_entries ) __field( u32, flags ) ), TP_fast_assign( __entry->fd = fd; __entry->ctx = ctx; __entry->sq_entries = sq_entries; __entry->cq_entries = cq_entries; __entry->flags = flags; ), TP_printk("ring %p, fd %d sq size %d, cq size %d, flags %d", __entry->ctx, __entry->fd, __entry->sq_entries, __entry->cq_entries, __entry->flags) ); /** * io_uring_register - called after a buffer/file/eventfd was succesfully * registered for a ring * * @ctx: pointer to a ring context structure * @opcode: describes which operation to perform * @nr_user_files: number of registered files * @nr_user_bufs: number of registered buffers * @cq_ev_fd: whether eventfs registered or not * @ret: return code * * Allows to trace fixed files/buffers/eventfds, that could be registered to * avoid an overhead of getting references to them for every operation. This * event, together with io_uring_file_get, can provide a full picture of how * much overhead one can reduce via fixing. */ TRACE_EVENT(io_uring_register, TP_PROTO(void *ctx, unsigned opcode, unsigned nr_files, unsigned nr_bufs, bool eventfd, long ret), TP_ARGS(ctx, opcode, nr_files, nr_bufs, eventfd, ret), TP_STRUCT__entry ( __field( void *, ctx ) __field( unsigned, opcode ) __field( unsigned, nr_files ) __field( unsigned, nr_bufs ) __field( bool, eventfd ) __field( long, ret ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->nr_files = nr_files; __entry->nr_bufs = nr_bufs; __entry->eventfd = eventfd; __entry->ret = ret; ), TP_printk("ring %p, opcode %d, nr_user_files %d, nr_user_bufs %d, " "eventfd %d, ret %ld", __entry->ctx, __entry->opcode, __entry->nr_files, __entry->nr_bufs, __entry->eventfd, __entry->ret) ); /** * io_uring_file_get - called before getting references to an SQE file * * @ctx: pointer to a ring context structure * @fd: SQE file descriptor * * Allows to trace out how often an SQE file reference is obtained, which can * help figuring out if it makes sense to use fixed files, or check that fixed * files are used correctly. */ TRACE_EVENT(io_uring_file_get, TP_PROTO(void *ctx, int fd), TP_ARGS(ctx, fd), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, fd ) ), TP_fast_assign( __entry->ctx = ctx; __entry->fd = fd; ), TP_printk("ring %p, fd %d", __entry->ctx, __entry->fd) ); /** * io_uring_queue_async_work - called before submitting a new async work * * @ctx: pointer to a ring context structure * @hashed: type of workqueue, hashed or normal * @req: pointer to a submitted request * @work: pointer to a submitted io_wq_work * * Allows to trace asynchronous work submission. */ TRACE_EVENT(io_uring_queue_async_work, TP_PROTO(void *ctx, int rw, void * req, struct io_wq_work *work, unsigned int flags), TP_ARGS(ctx, rw, req, work, flags), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, rw ) __field( void *, req ) __field( struct io_wq_work *, work ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->ctx = ctx; __entry->rw = rw; __entry->req = req; __entry->work = work; __entry->flags = flags; ), TP_printk("ring %p, request %p, flags %d, %s queue, work %p", __entry->ctx, __entry->req, __entry->flags, __entry->rw ? "hashed" : "normal", __entry->work) ); /** * io_uring_defer - called when an io_uring request is deferred * * @ctx: pointer to a ring context structure * @req: pointer to a deferred request * @user_data: user data associated with the request * * Allows to track deferred requests, to get an insight about what requests are * not started immediately. */ TRACE_EVENT(io_uring_defer, TP_PROTO(void *ctx, void *req, unsigned long long user_data), TP_ARGS(ctx, req, user_data), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( unsigned long long, data ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->data = user_data; ), TP_printk("ring %p, request %p user_data %llu", __entry->ctx, __entry->req, __entry->data) ); /** * io_uring_link - called before the io_uring request added into link_list of * another request * * @ctx: pointer to a ring context structure * @req: pointer to a linked request * @target_req: pointer to a previous request, that would contain @req * * Allows to track linked requests, to understand dependencies between requests * and how does it influence their execution flow. */ TRACE_EVENT(io_uring_link, TP_PROTO(void *ctx, void *req, void *target_req), TP_ARGS(ctx, req, target_req), TP_STRUCT__entry ( __field( void *, ctx ) __field( void *, req ) __field( void *, target_req ) ), TP_fast_assign( __entry->ctx = ctx; __entry->req = req; __entry->target_req = target_req; ), TP_printk("ring %p, request %p linked after %p", __entry->ctx, __entry->req, __entry->target_req) ); /** * io_uring_cqring_wait - called before start waiting for an available CQE * * @ctx: pointer to a ring context structure * @min_events: minimal number of events to wait for * * Allows to track waiting for CQE, so that we can e.g. troubleshoot * situations, when an application wants to wait for an event, that never * comes. */ TRACE_EVENT(io_uring_cqring_wait, TP_PROTO(void *ctx, int min_events), TP_ARGS(ctx, min_events), TP_STRUCT__entry ( __field( void *, ctx ) __field( int, min_events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->min_events = min_events; ), TP_printk("ring %p, min_events %d", __entry->ctx, __entry->min_events) ); /** * io_uring_fail_link - called before failing a linked request * * @req: request, which links were cancelled * @link: cancelled link * * Allows to track linked requests cancellation, to see not only that some work * was cancelled, but also which request was the reason. */ TRACE_EVENT(io_uring_fail_link, TP_PROTO(void *req, void *link), TP_ARGS(req, link), TP_STRUCT__entry ( __field( void *, req ) __field( void *, link ) ), TP_fast_assign( __entry->req = req; __entry->link = link; ), TP_printk("request %p, link %p", __entry->req, __entry->link) ); /** * io_uring_complete - called when completing an SQE * * @ctx: pointer to a ring context structure * @user_data: user data associated with the request * @res: result of the request * */ TRACE_EVENT(io_uring_complete, TP_PROTO(void *ctx, u64 user_data, long res), TP_ARGS(ctx, user_data, res), TP_STRUCT__entry ( __field( void *, ctx ) __field( u64, user_data ) __field( long, res ) ), TP_fast_assign( __entry->ctx = ctx; __entry->user_data = user_data; __entry->res = res; ), TP_printk("ring %p, user_data 0x%llx, result %ld", __entry->ctx, (unsigned long long)__entry->user_data, __entry->res) ); /** * io_uring_submit_sqe - called before submitting one SQE * * @ctx: pointer to a ring context structure * @opcode: opcode of request * @user_data: user data associated with the request * @force_nonblock: whether a context blocking or not * @sq_thread: true if sq_thread has submitted this SQE * * Allows to track SQE submitting, to understand what was the source of it, SQ * thread or io_uring_enter call. */ TRACE_EVENT(io_uring_submit_sqe, TP_PROTO(void *ctx, u8 opcode, u64 user_data, bool force_nonblock, bool sq_thread), TP_ARGS(ctx, opcode, user_data, force_nonblock, sq_thread), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( bool, force_nonblock ) __field( bool, sq_thread ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->force_nonblock = force_nonblock; __entry->sq_thread = sq_thread; ), TP_printk("ring %p, op %d, data 0x%llx, non block %d, sq_thread %d", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->force_nonblock, __entry->sq_thread) ); TRACE_EVENT(io_uring_poll_arm, TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask, int events), TP_ARGS(ctx, opcode, user_data, mask, events), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( int, mask ) __field( int, events ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->mask = mask; __entry->events = events; ), TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x, events 0x%x", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->mask, __entry->events) ); TRACE_EVENT(io_uring_poll_wake, TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), TP_ARGS(ctx, opcode, user_data, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( int, mask ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->mask = mask; ), TP_printk("ring %p, op %d, data 0x%llx, mask 0x%x", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->mask) ); TRACE_EVENT(io_uring_task_add, TP_PROTO(void *ctx, u8 opcode, u64 user_data, int mask), TP_ARGS(ctx, opcode, user_data, mask), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) __field( int, mask ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; __entry->mask = mask; ), TP_printk("ring %p, op %d, data 0x%llx, mask %x", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data, __entry->mask) ); TRACE_EVENT(io_uring_task_run, TP_PROTO(void *ctx, u8 opcode, u64 user_data), TP_ARGS(ctx, opcode, user_data), TP_STRUCT__entry ( __field( void *, ctx ) __field( u8, opcode ) __field( u64, user_data ) ), TP_fast_assign( __entry->ctx = ctx; __entry->opcode = opcode; __entry->user_data = user_data; ), TP_printk("ring %p, op %d, data 0x%llx", __entry->ctx, __entry->opcode, (unsigned long long) __entry->user_data) ); #endif /* _TRACE_IO_URING_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 /* SPDX-License-Identifier: GPL-2.0 */ /* * RT Mutexes: blocking mutual exclusion locks with PI support * * started by Ingo Molnar and Thomas Gleixner: * * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * This file contains the private data structure and API definitions. */ #ifndef __KERNEL_RTMUTEX_COMMON_H #define __KERNEL_RTMUTEX_COMMON_H #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack on of the blocked task. * * @tree_entry: pi node to enqueue into the mutex waiters tree * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { struct rb_node tree_entry; struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif int prio; u64 deadline; }; /* * Various helpers to access the waiters-tree: */ #ifdef CONFIG_RT_MUTEXES static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; if (leftmost) { w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); } return w; } static inline int task_has_pi_waiters(struct task_struct *p) { return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, pi_tree_entry); } #else static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return false; } static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { return NULL; } static inline int task_has_pi_waiters(struct task_struct *p) { return false; } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { return NULL; } #endif /* * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* * Constants for rt mutex functions which have a selectable deadlock * detection. * * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are * no further PI adjustments to be made. * * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full * walk of the lock chain. */ enum rtmutex_chainwalk { RT_MUTEX_MIN_CHAINWALK, RT_MUTEX_FULL_CHAINWALK, }; /* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wqh); extern void rt_mutex_postunlock(struct wake_q_head *wake_q); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" #else # include "rtmutex.h" #endif #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEAD: Authenticated Encryption with Associated Data * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_AEAD_H #define _CRYPTO_AEAD_H #include <linux/crypto.h> #include <linux/kernel.h> #include <linux/slab.h> /** * DOC: Authenticated Encryption With Associated Data (AEAD) Cipher API * * The AEAD cipher API is used with the ciphers of type CRYPTO_ALG_TYPE_AEAD * (listed as type "aead" in /proc/crypto) * * The most prominent examples for this type of encryption is GCM and CCM. * However, the kernel supports other types of AEAD ciphers which are defined * with the following cipher string: * * authenc(keyed message digest, block cipher) * * For example: authenc(hmac(sha256), cbc(aes)) * * The example code provided for the symmetric key cipher operation * applies here as well. Naturally all *skcipher* symbols must be exchanged * the *aead* pendants discussed in the following. In addition, for the AEAD * operation, the aead_request_set_ad function must be used to set the * pointer to the associated data memory location before performing the * encryption or decryption operation. In case of an encryption, the associated * data memory is filled during the encryption operation. For decryption, the * associated data memory must contain data that is used to verify the integrity * of the decrypted data. Another deviation from the asynchronous block cipher * operation is that the caller should explicitly check for -EBADMSG of the * crypto_aead_decrypt. That error indicates an authentication error, i.e. * a breach in the integrity of the message. In essence, that -EBADMSG error * code is the key bonus an AEAD cipher has over "standard" block chaining * modes. * * Memory Structure: * * The source scatterlist must contain the concatenation of * associated data || plaintext or ciphertext. * * The destination scatterlist has the same layout, except that the plaintext * (resp. ciphertext) will grow (resp. shrink) by the authentication tag size * during encryption (resp. decryption). * * In-place encryption/decryption is enabled by using the same scatterlist * pointer for both the source and destination. * * Even in the out-of-place case, space must be reserved in the destination for * the associated data, even though it won't be written to. This makes the * in-place and out-of-place cases more consistent. It is permissible for the * "destination" associated data to alias the "source" associated data. * * As with the other scatterlist crypto APIs, zero-length scatterlist elements * are not allowed in the used part of the scatterlist. Thus, if there is no * associated data, the first element must point to the plaintext/ciphertext. * * To meet the needs of IPsec, a special quirk applies to rfc4106, rfc4309, * rfc4543, and rfc7539esp ciphers. For these ciphers, the final 'ivsize' bytes * of the associated data buffer must contain a second copy of the IV. This is * in addition to the copy passed to aead_request_set_crypt(). These two IV * copies must not differ; different implementations of the same algorithm may * behave differently in that case. Note that the algorithm might not actually * treat the IV as associated data; nevertheless the length passed to * aead_request_set_ad() must include it. */ struct crypto_aead; /** * struct aead_request - AEAD request * @base: Common attributes for async crypto requests * @assoclen: Length in bytes of associated data for authentication * @cryptlen: Length of data to be encrypted or decrypted * @iv: Initialisation vector * @src: Source data * @dst: Destination data * @__ctx: Start of private context data */ struct aead_request { struct crypto_async_request base; unsigned int assoclen; unsigned int cryptlen; u8 *iv; struct scatterlist *src; struct scatterlist *dst; void *__ctx[] CRYPTO_MINALIGN_ATTR; }; /** * struct aead_alg - AEAD cipher definition * @maxauthsize: Set the maximum authentication tag size supported by the * transformation. A transformation may support smaller tag sizes. * As the authentication tag is a message digest to ensure the * integrity of the encrypted data, a consumer typically wants the * largest authentication tag possible as defined by this * variable. * @setauthsize: Set authentication size for the AEAD transformation. This * function is used to specify the consumer requested size of the * authentication tag to be either generated by the transformation * during encryption or the size of the authentication tag to be * supplied during the decryption operation. This function is also * responsible for checking the authentication tag size for * validity. * @setkey: see struct skcipher_alg * @encrypt: see struct skcipher_alg * @decrypt: see struct skcipher_alg * @ivsize: see struct skcipher_alg * @chunksize: see struct skcipher_alg * @init: Initialize the cryptographic transformation object. This function * is used to initialize the cryptographic transformation object. * This function is called only once at the instantiation time, right * after the transformation context was allocated. In case the * cryptographic hardware has some special requirements which need to * be handled by software, this function shall check for the precise * requirement of the transformation and put any software fallbacks * in place. * @exit: Deinitialize the cryptographic transformation object. This is a * counterpart to @init, used to remove various changes set in * @init. * @base: Definition of a generic crypto cipher algorithm. * * All fields except @ivsize is mandatory and must be filled. */ struct aead_alg { int (*setkey)(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); int (*setauthsize)(struct crypto_aead *tfm, unsigned int authsize); int (*encrypt)(struct aead_request *req); int (*decrypt)(struct aead_request *req); int (*init)(struct crypto_aead *tfm); void (*exit)(struct crypto_aead *tfm); unsigned int ivsize; unsigned int maxauthsize; unsigned int chunksize; struct crypto_alg base; }; struct crypto_aead { unsigned int authsize; unsigned int reqsize; struct crypto_tfm base; }; static inline struct crypto_aead *__crypto_aead_cast(struct crypto_tfm *tfm) { return container_of(tfm, struct crypto_aead, base); } /** * crypto_alloc_aead() - allocate AEAD cipher handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * AEAD cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for an AEAD. The returned struct * crypto_aead is the cipher handle that is required for any subsequent * API invocation for that AEAD. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_aead *crypto_alloc_aead(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_aead_tfm(struct crypto_aead *tfm) { return &tfm->base; } /** * crypto_free_aead() - zeroize and free aead handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_aead(struct crypto_aead *tfm) { crypto_destroy_tfm(tfm, crypto_aead_tfm(tfm)); } static inline struct aead_alg *crypto_aead_alg(struct crypto_aead *tfm) { return container_of(crypto_aead_tfm(tfm)->__crt_alg, struct aead_alg, base); } static inline unsigned int crypto_aead_alg_ivsize(struct aead_alg *alg) { return alg->ivsize; } /** * crypto_aead_ivsize() - obtain IV size * @tfm: cipher handle * * The size of the IV for the aead referenced by the cipher handle is * returned. This IV size may be zero if the cipher does not need an IV. * * Return: IV size in bytes */ static inline unsigned int crypto_aead_ivsize(struct crypto_aead *tfm) { return crypto_aead_alg_ivsize(crypto_aead_alg(tfm)); } /** * crypto_aead_authsize() - obtain maximum authentication data size * @tfm: cipher handle * * The maximum size of the authentication data for the AEAD cipher referenced * by the AEAD cipher handle is returned. The authentication data size may be * zero if the cipher implements a hard-coded maximum. * * The authentication data may also be known as "tag value". * * Return: authentication data size / tag size in bytes */ static inline unsigned int crypto_aead_authsize(struct crypto_aead *tfm) { return tfm->authsize; } static inline unsigned int crypto_aead_alg_maxauthsize(struct aead_alg *alg) { return alg->maxauthsize; } static inline unsigned int crypto_aead_maxauthsize(struct crypto_aead *aead) { return crypto_aead_alg_maxauthsize(crypto_aead_alg(aead)); } /** * crypto_aead_blocksize() - obtain block size of cipher * @tfm: cipher handle * * The block size for the AEAD referenced with the cipher handle is returned. * The caller may use that information to allocate appropriate memory for the * data returned by the encryption or decryption operation * * Return: block size of cipher */ static inline unsigned int crypto_aead_blocksize(struct crypto_aead *tfm) { return crypto_tfm_alg_blocksize(crypto_aead_tfm(tfm)); } static inline unsigned int crypto_aead_alignmask(struct crypto_aead *tfm) { return crypto_tfm_alg_alignmask(crypto_aead_tfm(tfm)); } static inline u32 crypto_aead_get_flags(struct crypto_aead *tfm) { return crypto_tfm_get_flags(crypto_aead_tfm(tfm)); } static inline void crypto_aead_set_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_set_flags(crypto_aead_tfm(tfm), flags); } static inline void crypto_aead_clear_flags(struct crypto_aead *tfm, u32 flags) { crypto_tfm_clear_flags(crypto_aead_tfm(tfm), flags); } /** * crypto_aead_setkey() - set key for cipher * @tfm: cipher handle * @key: buffer holding the key * @keylen: length of the key in bytes * * The caller provided key is set for the AEAD referenced by the cipher * handle. * * Note, the key length determines the cipher type. Many block ciphers implement * different cipher modes depending on the key size, such as AES-128 vs AES-192 * vs. AES-256. When providing a 16 byte key for an AES cipher handle, AES-128 * is performed. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setkey(struct crypto_aead *tfm, const u8 *key, unsigned int keylen); /** * crypto_aead_setauthsize() - set authentication data size * @tfm: cipher handle * @authsize: size of the authentication data / tag in bytes * * Set the authentication data size / tag size. AEAD requires an authentication * tag (or MAC) in addition to the associated data. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_aead_setauthsize(struct crypto_aead *tfm, unsigned int authsize); static inline struct crypto_aead *crypto_aead_reqtfm(struct aead_request *req) { return __crypto_aead_cast(req->base.tfm); } /** * crypto_aead_encrypt() - encrypt plaintext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Encrypt plaintext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The encryption operation creates the authentication data / * tag. That data is concatenated with the created ciphertext. * The ciphertext memory size is therefore the given number of * block cipher blocks + the size defined by the * crypto_aead_setauthsize invocation. The caller must ensure * that sufficient memory is available for the ciphertext and * the authentication tag. * * Return: 0 if the cipher operation was successful; < 0 if an error occurred */ int crypto_aead_encrypt(struct aead_request *req); /** * crypto_aead_decrypt() - decrypt ciphertext * @req: reference to the aead_request handle that holds all information * needed to perform the cipher operation * * Decrypt ciphertext data using the aead_request handle. That data structure * and how it is filled with data is discussed with the aead_request_* * functions. * * IMPORTANT NOTE The caller must concatenate the ciphertext followed by the * authentication data / tag. That authentication data / tag * must have the size defined by the crypto_aead_setauthsize * invocation. * * * Return: 0 if the cipher operation was successful; -EBADMSG: The AEAD * cipher operation performs the authentication of the data during the * decryption operation. Therefore, the function returns this error if * the authentication of the ciphertext was unsuccessful (i.e. the * integrity of the ciphertext or the associated data was violated); * < 0 if an error occurred. */ int crypto_aead_decrypt(struct aead_request *req); /** * DOC: Asynchronous AEAD Request Handle * * The aead_request data structure contains all pointers to data required for * the AEAD cipher operation. This includes the cipher handle (which can be * used by multiple aead_request instances), pointer to plaintext and * ciphertext, asynchronous callback function, etc. It acts as a handle to the * aead_request_* API calls in a similar way as AEAD handle to the * crypto_aead_* API calls. */ /** * crypto_aead_reqsize() - obtain size of the request data structure * @tfm: cipher handle * * Return: number of bytes */ static inline unsigned int crypto_aead_reqsize(struct crypto_aead *tfm) { return tfm->reqsize; } /** * aead_request_set_tfm() - update cipher handle reference in request * @req: request handle to be modified * @tfm: cipher handle that shall be added to the request handle * * Allow the caller to replace the existing aead handle in the request * data structure with a different one. */ static inline void aead_request_set_tfm(struct aead_request *req, struct crypto_aead *tfm) { req->base.tfm = crypto_aead_tfm(tfm); } /** * aead_request_alloc() - allocate request data structure * @tfm: cipher handle to be registered with the request * @gfp: memory allocation flag that is handed to kmalloc by the API call. * * Allocate the request data structure that must be used with the AEAD * encrypt and decrypt API calls. During the allocation, the provided aead * handle is registered in the request data structure. * * Return: allocated request handle in case of success, or NULL if out of memory */ static inline struct aead_request *aead_request_alloc(struct crypto_aead *tfm, gfp_t gfp) { struct aead_request *req; req = kmalloc(sizeof(*req) + crypto_aead_reqsize(tfm), gfp); if (likely(req)) aead_request_set_tfm(req, tfm); return req; } /** * aead_request_free() - zeroize and free request data structure * @req: request data structure cipher handle to be freed */ static inline void aead_request_free(struct aead_request *req) { kfree_sensitive(req); } /** * aead_request_set_callback() - set asynchronous callback function * @req: request handle * @flags: specify zero or an ORing of the flags * CRYPTO_TFM_REQ_MAY_BACKLOG the request queue may back log and * increase the wait queue beyond the initial maximum size; * CRYPTO_TFM_REQ_MAY_SLEEP the request processing may sleep * @compl: callback function pointer to be registered with the request handle * @data: The data pointer refers to memory that is not used by the kernel * crypto API, but provided to the callback function for it to use. Here, * the caller can provide a reference to memory the callback function can * operate on. As the callback function is invoked asynchronously to the * related functionality, it may need to access data structures of the * related functionality which can be referenced using this pointer. The * callback function can access the memory via the "data" field in the * crypto_async_request data structure provided to the callback function. * * Setting the callback function that is triggered once the cipher operation * completes * * The callback function is registered with the aead_request handle and * must comply with the following template:: * * void callback_function(struct crypto_async_request *req, int error) */ static inline void aead_request_set_callback(struct aead_request *req, u32 flags, crypto_completion_t compl, void *data) { req->base.complete = compl; req->base.data = data; req->base.flags = flags; } /** * aead_request_set_crypt - set data buffers * @req: request handle * @src: source scatter / gather list * @dst: destination scatter / gather list * @cryptlen: number of bytes to process from @src * @iv: IV for the cipher operation which must comply with the IV size defined * by crypto_aead_ivsize() * * Setting the source data and destination data scatter / gather lists which * hold the associated data concatenated with the plaintext or ciphertext. See * below for the authentication tag. * * For encryption, the source is treated as the plaintext and the * destination is the ciphertext. For a decryption operation, the use is * reversed - the source is the ciphertext and the destination is the plaintext. * * The memory structure for cipher operation has the following structure: * * - AEAD encryption input: assoc data || plaintext * - AEAD encryption output: assoc data || cipherntext || auth tag * - AEAD decryption input: assoc data || ciphertext || auth tag * - AEAD decryption output: assoc data || plaintext * * Albeit the kernel requires the presence of the AAD buffer, however, * the kernel does not fill the AAD buffer in the output case. If the * caller wants to have that data buffer filled, the caller must either * use an in-place cipher operation (i.e. same memory location for * input/output memory location). */ static inline void aead_request_set_crypt(struct aead_request *req, struct scatterlist *src, struct scatterlist *dst, unsigned int cryptlen, u8 *iv) { req->src = src; req->dst = dst; req->cryptlen = cryptlen; req->iv = iv; } /** * aead_request_set_ad - set associated data information * @req: request handle * @assoclen: number of bytes in associated data * * Setting the AD information. This function sets the length of * the associated data. */ static inline void aead_request_set_ad(struct aead_request *req, unsigned int assoclen) { req->assoclen = assoclen; } #endif /* _CRYPTO_AEAD_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ struct inotify_event_info { struct fsnotify_event fse; u32 mask; int wd; u32 sync_cookie; int name_len; char name[]; }; struct inotify_inode_mark { struct fsnotify_mark fsn_mark; int wd; }; static inline struct inotify_event_info *INOTIFY_E(struct fsnotify_event *fse) { return container_of(fse, struct inotify_event_info, fse); } extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark, struct fsnotify_group *group); extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; extern struct kmem_cache *inotify_inode_mark_cachep; #ifdef CONFIG_INOTIFY_USER static inline void dec_inotify_instances(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_INOTIFY_INSTANCES); } static inline struct ucounts *inc_inotify_watches(struct ucounts *ucounts) { return inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_INOTIFY_WATCHES); } static inline void dec_inotify_watches(struct ucounts *ucounts) { dec_ucount(ucounts, UCOUNT_INOTIFY_WATCHES); } #endif
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 // SPDX-License-Identifier: GPL-2.0 #ifndef _LINUX_KERNEL_TRACE_H #define _LINUX_KERNEL_TRACE_H #include <linux/fs.h> #include <linux/atomic.h> #include <linux/sched.h> #include <linux/clocksource.h> #include <linux/ring_buffer.h> #include <linux/mmiotrace.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> #include <linux/trace.h> #include <linux/hw_breakpoint.h> #include <linux/trace_seq.h> #include <linux/trace_events.h> #include <linux/compiler.h> #include <linux/glob.h> #include <linux/irq_work.h> #include <linux/workqueue.h> #include <linux/ctype.h> #ifdef CONFIG_FTRACE_SYSCALLS #include <asm/unistd.h> /* For NR_SYSCALLS */ #include <asm/syscall.h> /* some archs define it here */ #endif enum trace_type { __TRACE_FIRST_TYPE = 0, TRACE_FN, TRACE_CTX, TRACE_WAKE, TRACE_STACK, TRACE_PRINT, TRACE_BPRINT, TRACE_MMIO_RW, TRACE_MMIO_MAP, TRACE_BRANCH, TRACE_GRAPH_RET, TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BLK, TRACE_BPUTS, TRACE_HWLAT, TRACE_RAW_DATA, __TRACE_LAST_TYPE, }; #undef __field #define __field(type, item) type item; #undef __field_fn #define __field_fn(type, item) type item; #undef __field_struct #define __field_struct(type, item) __field(type, item) #undef __field_desc #define __field_desc(type, container, item) #undef __field_packed #define __field_packed(type, container, item) #undef __array #define __array(type, item, size) type item[size]; #undef __array_desc #define __array_desc(type, container, item, size) #undef __dynamic_array #define __dynamic_array(type, item) type item[]; #undef F_STRUCT #define F_STRUCT(args...) args #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ struct struct_name { \ struct trace_entry ent; \ tstruct \ } #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(name, name_struct, id, tstruct, printk) #undef FTRACE_ENTRY_REG #define FTRACE_ENTRY_REG(name, struct_name, id, tstruct, print, regfn) \ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED #define FTRACE_ENTRY_PACKED(name, struct_name, id, tstruct, print) \ FTRACE_ENTRY(name, struct_name, id, PARAMS(tstruct), PARAMS(print)) __packed #include "trace_entries.h" /* Use this for memory failure errors */ #define MEM_FAIL(condition, fmt, ...) ({ \ static bool __section(".data.once") __warned; \ int __ret_warn_once = !!(condition); \ \ if (unlikely(__ret_warn_once && !__warned)) { \ __warned = true; \ pr_err("ERROR: " fmt, ##__VA_ARGS__); \ } \ unlikely(__ret_warn_once); \ }) /* * syscalls are special, and need special handling, this is why * they are not included in trace_entries.h */ struct syscall_trace_enter { struct trace_entry ent; int nr; unsigned long args[]; }; struct syscall_trace_exit { struct trace_entry ent; int nr; long ret; }; struct kprobe_trace_entry_head { struct trace_entry ent; unsigned long ip; }; struct kretprobe_trace_entry_head { struct trace_entry ent; unsigned long func; unsigned long ret_ip; }; /* * trace_flag_type is an enumeration that holds different * states when a trace occurs. These are: * IRQS_OFF - interrupts were disabled * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags * NEED_RESCHED - reschedule is requested * HARDIRQ - inside an interrupt handler * SOFTIRQ - inside a softirq handler */ enum trace_flag_type { TRACE_FLAG_IRQS_OFF = 0x01, TRACE_FLAG_IRQS_NOSUPPORT = 0x02, TRACE_FLAG_NEED_RESCHED = 0x04, TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, TRACE_FLAG_NMI = 0x40, }; #define TRACE_BUF_SIZE 1024 struct trace_array; /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started * the trace, etc.) */ struct trace_array_cpu { atomic_t disabled; void *buffer_page; /* ring buffer spare */ unsigned long entries; unsigned long saved_latency; unsigned long critical_start; unsigned long critical_end; unsigned long critical_sequence; unsigned long nice; unsigned long policy; unsigned long rt_priority; unsigned long skipped_entries; u64 preempt_timestamp; pid_t pid; kuid_t uid; char comm[TASK_COMM_LEN]; #ifdef CONFIG_FUNCTION_TRACER int ftrace_ignore_pid; #endif bool ignore_pid; }; struct tracer; struct trace_option_dentry; struct array_buffer { struct trace_array *tr; struct trace_buffer *buffer; struct trace_array_cpu __percpu *data; u64 time_start; int cpu; }; #define TRACE_FLAGS_MAX_SIZE 32 struct trace_options { struct tracer *tracer; struct trace_option_dentry *topts; }; struct trace_pid_list { int pid_max; unsigned long *pids; }; enum { TRACE_PIDS = BIT(0), TRACE_NO_PIDS = BIT(1), }; static inline bool pid_type_enabled(int type, struct trace_pid_list *pid_list, struct trace_pid_list *no_pid_list) { /* Return true if the pid list in type has pids */ return ((type & TRACE_PIDS) && pid_list) || ((type & TRACE_NO_PIDS) && no_pid_list); } static inline bool still_need_pid_events(int type, struct trace_pid_list *pid_list, struct trace_pid_list *no_pid_list) { /* * Turning off what is in @type, return true if the "other" * pid list, still has pids in it. */ return (!(type & TRACE_PIDS) && pid_list) || (!(type & TRACE_NO_PIDS) && no_pid_list); } typedef bool (*cond_update_fn_t)(struct trace_array *tr, void *cond_data); /** * struct cond_snapshot - conditional snapshot data and callback * * The cond_snapshot structure encapsulates a callback function and * data associated with the snapshot for a given tracing instance. * * When a snapshot is taken conditionally, by invoking * tracing_snapshot_cond(tr, cond_data), the cond_data passed in is * passed in turn to the cond_snapshot.update() function. That data * can be compared by the update() implementation with the cond_data * contained within the struct cond_snapshot instance associated with * the trace_array. Because the tr->max_lock is held throughout the * update() call, the update() function can directly retrieve the * cond_snapshot and cond_data associated with the per-instance * snapshot associated with the trace_array. * * The cond_snapshot.update() implementation can save data to be * associated with the snapshot if it decides to, and returns 'true' * in that case, or it returns 'false' if the conditional snapshot * shouldn't be taken. * * The cond_snapshot instance is created and associated with the * user-defined cond_data by tracing_cond_snapshot_enable(). * Likewise, the cond_snapshot instance is destroyed and is no longer * associated with the trace instance by * tracing_cond_snapshot_disable(). * * The method below is required. * * @update: When a conditional snapshot is invoked, the update() * callback function is invoked with the tr->max_lock held. The * update() implementation signals whether or not to actually * take the snapshot, by returning 'true' if so, 'false' if no * snapshot should be taken. Because the max_lock is held for * the duration of update(), the implementation is safe to * directly retrieved and save any implementation data it needs * to in association with the snapshot. */ struct cond_snapshot { void *cond_data; cond_update_fn_t update; }; /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: */ struct trace_array { struct list_head list; char *name; struct array_buffer array_buffer; #ifdef CONFIG_TRACER_MAX_TRACE /* * The max_buffer is used to snapshot the trace when a maximum * latency is reached, or when the user initiates a snapshot. * Some tracers will use this to store a maximum trace while * it continues examining live traces. * * The buffers for the max_buffer are set up the same as the array_buffer * When a snapshot is taken, the buffer of the max_buffer is swapped * with the buffer of the array_buffer and the buffers are reset for * the array_buffer so the tracing can continue. */ struct array_buffer max_buffer; bool allocated_snapshot; #endif #if defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER) unsigned long max_latency; #ifdef CONFIG_FSNOTIFY struct dentry *d_max_latency; struct work_struct fsnotify_work; struct irq_work fsnotify_irqwork; #endif #endif struct trace_pid_list __rcu *filtered_pids; struct trace_pid_list __rcu *filtered_no_pids; /* * max_lock is used to protect the swapping of buffers * when taking a max snapshot. The buffers themselves are * protected by per_cpu spinlocks. But the action of the swap * needs its own lock. * * This is defined as a arch_spinlock_t in order to help * with performance when lockdep debugging is enabled. * * It is also used in other places outside the update_max_tr * so it needs to be defined outside of the * CONFIG_TRACER_MAX_TRACE. */ arch_spinlock_t max_lock; int buffer_disabled; #ifdef CONFIG_FTRACE_SYSCALLS int sys_refcount_enter; int sys_refcount_exit; struct trace_event_file __rcu *enter_syscall_files[NR_syscalls]; struct trace_event_file __rcu *exit_syscall_files[NR_syscalls]; #endif int stop_count; int clock_id; int nr_topts; bool clear_trace; int buffer_percent; unsigned int n_err_log_entries; struct tracer *current_trace; unsigned int trace_flags; unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE]; unsigned int flags; raw_spinlock_t start_lock; struct list_head err_log; struct dentry *dir; struct dentry *options; struct dentry *percpu_dir; struct dentry *event_dir; struct trace_options *topts; struct list_head systems; struct list_head events; struct trace_event_file *trace_marker_file; cpumask_var_t tracing_cpumask; /* only trace on set CPUs */ int ref; int trace_ref; #ifdef CONFIG_FUNCTION_TRACER struct ftrace_ops *ops; struct trace_pid_list __rcu *function_pids; struct trace_pid_list __rcu *function_no_pids; #ifdef CONFIG_DYNAMIC_FTRACE /* All of these are protected by the ftrace_lock */ struct list_head func_probes; struct list_head mod_trace; struct list_head mod_notrace; #endif /* function tracing enabled */ int function_enabled; #endif int time_stamp_abs_ref; struct list_head hist_vars; #ifdef CONFIG_TRACER_SNAPSHOT struct cond_snapshot *cond_snapshot; #endif }; enum { TRACE_ARRAY_FL_GLOBAL = (1 << 0) }; extern struct list_head ftrace_trace_arrays; extern struct mutex trace_types_lock; extern int trace_array_get(struct trace_array *tr); extern int tracing_check_open_get_tr(struct trace_array *tr); extern struct trace_array *trace_array_find(const char *instance); extern struct trace_array *trace_array_find_get(const char *instance); extern int tracing_set_time_stamp_abs(struct trace_array *tr, bool abs); extern int tracing_set_clock(struct trace_array *tr, const char *clockstr); extern bool trace_clock_in_ns(struct trace_array *tr); /* * The global tracer (top) should be the first trace array added, * but we check the flag anyway. */ static inline struct trace_array *top_trace_array(void) { struct trace_array *tr; if (list_empty(&ftrace_trace_arrays)) return NULL; tr = list_entry(ftrace_trace_arrays.prev, typeof(*tr), list); WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); return tr; } #define FTRACE_CMP_TYPE(var, type) \ __builtin_types_compatible_p(typeof(var), type *) #undef IF_ASSIGN #define IF_ASSIGN(var, entry, etype, id) \ if (FTRACE_CMP_TYPE(var, etype)) { \ var = (typeof(var))(entry); \ WARN_ON(id != 0 && (entry)->type != id); \ break; \ } /* Will cause compile errors if type is not found. */ extern void __ftrace_bad_type(void); /* * The trace_assign_type is a verifier that the entry type is * the same as the type being assigned. To add new types simply * add a line with the following format: * * IF_ASSIGN(var, ent, type, id); * * Where "type" is the trace type that includes the trace_entry * as the "ent" item. And "id" is the trace identifier that is * used in the trace_type enum. * * If the type can have more than one id, then use zero. */ #define trace_assign_type(var, ent) \ do { \ IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \ IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \ IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct hwlat_entry, TRACE_HWLAT); \ IF_ASSIGN(var, ent, struct raw_data_entry, TRACE_RAW_DATA);\ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ TRACE_MMIO_MAP); \ IF_ASSIGN(var, ent, struct trace_branch, TRACE_BRANCH); \ IF_ASSIGN(var, ent, struct ftrace_graph_ent_entry, \ TRACE_GRAPH_ENT); \ IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry, \ TRACE_GRAPH_RET); \ __ftrace_bad_type(); \ } while (0) /* * An option specific to a tracer. This is a boolean value. * The bit is the bit index that sets its value on the * flags value in struct tracer_flags. */ struct tracer_opt { const char *name; /* Will appear on the trace_options file */ u32 bit; /* Mask assigned in val field in tracer_flags */ }; /* * The set of specific options for a tracer. Your tracer * have to set the initial value of the flags val. */ struct tracer_flags { u32 val; struct tracer_opt *opts; struct tracer *trace; }; /* Makes more easy to define a tracer opt */ #define TRACER_OPT(s, b) .name = #s, .bit = b struct trace_option_dentry { struct tracer_opt *opt; struct tracer_flags *flags; struct trace_array *tr; struct dentry *entry; }; /** * struct tracer - a specific tracer and its callbacks to interact with tracefs * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer * @start: called when tracing is unpaused (echo 1 > tracing_on) * @stop: called when tracing is paused (echo 0 > tracing_on) * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened * @close: called when the trace file is released * @pipe_close: called when the trace_pipe file is released * @read: override the default read callback on trace_pipe * @splice_read: override the default splice_read callback on trace_pipe * @selftest: selftest to run on boot (see trace_selftest.c) * @print_headers: override the first lines that describe your columns * @print_line: callback that prints a trace * @set_flag: signals one of your private flags changed (trace_options file) * @flags: your private flags */ struct tracer { const char *name; int (*init)(struct trace_array *tr); void (*reset)(struct trace_array *tr); void (*start)(struct trace_array *tr); void (*stop)(struct trace_array *tr); int (*update_thresh)(struct trace_array *tr); void (*open)(struct trace_iterator *iter); void (*pipe_open)(struct trace_iterator *iter); void (*close)(struct trace_iterator *iter); void (*pipe_close)(struct trace_iterator *iter); ssize_t (*read)(struct trace_iterator *iter, struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos); ssize_t (*splice_read)(struct trace_iterator *iter, struct file *filp, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); #ifdef CONFIG_FTRACE_STARTUP_TEST int (*selftest)(struct tracer *trace, struct trace_array *tr); #endif void (*print_header)(struct seq_file *m); enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(struct trace_array *tr, u32 old_flags, u32 bit, int set); /* Return 0 if OK with change, else return non-zero */ int (*flag_changed)(struct trace_array *tr, u32 mask, int set); struct tracer *next; struct tracer_flags *flags; int enabled; bool print_max; bool allow_instances; #ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; #endif /* True if tracer cannot be enabled in kernel param */ bool noboot; }; /* Only current can touch trace_recursion */ /* * For function tracing recursion: * The order of these bits are important. * * When function tracing occurs, the following steps are made: * If arch does not support a ftrace feature: * call internal function (uses INTERNAL bits) which calls... * If callback is registered to the "global" list, the list * function is called and recursion checks the GLOBAL bits. * then this function calls... * The function callback, which can use the FTRACE bits to * check for recursion. */ enum { /* Function recursion bits */ TRACE_FTRACE_BIT, TRACE_FTRACE_NMI_BIT, TRACE_FTRACE_IRQ_BIT, TRACE_FTRACE_SIRQ_BIT, TRACE_FTRACE_TRANSITION_BIT, /* Internal use recursion bits */ TRACE_INTERNAL_BIT, TRACE_INTERNAL_NMI_BIT, TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, TRACE_INTERNAL_TRANSITION_BIT, TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. * As we need a way to maintain state if we are tracing the function * graph in irq because we want to trace a particular function that * was called in irq context but we have irq tracing off. Since this * can only be modified by current, we can reuse trace_recursion. */ TRACE_IRQ_BIT, /* Set if the function is in the set_graph_function file */ TRACE_GRAPH_BIT, /* * In the very unlikely case that an interrupt came in * at a start of graph tracing, and we want to trace * the function in that interrupt, the depth can be greater * than zero, because of the preempted start of a previous * trace. In an even more unlikely case, depth could be 2 * if a softirq interrupted the start of graph tracing, * followed by an interrupt preempting a start of graph * tracing in the softirq, and depth can even be 3 * if an NMI came in at the start of an interrupt function * that preempted a softirq start of a function that * preempted normal context!!!! Luckily, it can't be * greater than 3, so the next two bits are a mask * of what the depth is when we set TRACE_GRAPH_BIT */ TRACE_GRAPH_DEPTH_START_BIT, TRACE_GRAPH_DEPTH_END_BIT, /* * To implement set_graph_notrace, if this bit is set, we ignore * function graph tracing of called functions, until the return * function is called to clear it. */ TRACE_GRAPH_NOTRACE_BIT, }; #define trace_recursion_set(bit) do { (current)->trace_recursion |= (1<<(bit)); } while (0) #define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(1<<(bit)); } while (0) #define trace_recursion_test(bit) ((current)->trace_recursion & (1<<(bit))) #define trace_recursion_depth() \ (((current)->trace_recursion >> TRACE_GRAPH_DEPTH_START_BIT) & 3) #define trace_recursion_set_depth(depth) \ do { \ current->trace_recursion &= \ ~(3 << TRACE_GRAPH_DEPTH_START_BIT); \ current->trace_recursion |= \ ((depth) & 3) << TRACE_GRAPH_DEPTH_START_BIT; \ } while (0) #define TRACE_CONTEXT_BITS 4 #define TRACE_FTRACE_START TRACE_FTRACE_BIT #define TRACE_LIST_START TRACE_INTERNAL_BIT #define TRACE_CONTEXT_MASK ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1) enum { TRACE_CTX_NMI, TRACE_CTX_IRQ, TRACE_CTX_SOFTIRQ, TRACE_CTX_NORMAL, TRACE_CTX_TRANSITION, }; static __always_inline int trace_get_context_bit(void) { int bit; if (in_interrupt()) { if (in_nmi()) bit = TRACE_CTX_NMI; else if (in_irq()) bit = TRACE_CTX_IRQ; else bit = TRACE_CTX_SOFTIRQ; } else bit = TRACE_CTX_NORMAL; return bit; } static __always_inline int trace_test_and_set_recursion(int start) { unsigned int val = current->trace_recursion; int bit; bit = trace_get_context_bit() + start; if (unlikely(val & (1 << bit))) { /* * It could be that preempt_count has not been updated during * a switch between contexts. Allow for a single recursion. */ bit = start + TRACE_CTX_TRANSITION; if (trace_recursion_test(bit)) return -1; trace_recursion_set(bit); barrier(); return bit; } val |= 1 << bit; current->trace_recursion = val; barrier(); return bit; } static __always_inline void trace_clear_recursion(int bit) { unsigned int val = current->trace_recursion; bit = 1 << bit; val &= ~bit; barrier(); current->trace_recursion = val; } static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { return iter->buffer_iter ? iter->buffer_iter[cpu] : NULL; } int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void tracing_reset_online_cpus(struct array_buffer *buf); void tracing_reset_current(int cpu); void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); int tracing_open_generic_tr(struct inode *inode, struct file *filp); bool tracing_is_disabled(void); bool tracer_tracing_is_on(struct trace_array *tr); void tracer_tracing_on(struct trace_array *tr); void tracer_tracing_off(struct trace_array *tr); struct dentry *trace_create_file(const char *name, umode_t mode, struct dentry *parent, void *data, const struct file_operations *fops); int tracing_init_dentry(void); struct ring_buffer_event; struct ring_buffer_event * trace_buffer_lock_reserve(struct trace_buffer *buffer, int type, unsigned long len, unsigned long flags, int pc); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); struct trace_entry *trace_find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts); void trace_buffer_unlock_commit_nostack(struct trace_buffer *buffer, struct ring_buffer_event *event); int trace_empty(struct trace_iterator *iter); void *trace_find_next_entry_inc(struct trace_iterator *iter); void trace_init_global_iter(struct trace_iterator *iter); void tracing_iter_reset(struct trace_iterator *iter, int cpu); unsigned long trace_total_entries_cpu(struct trace_array *tr, int cpu); unsigned long trace_total_entries(struct trace_array *tr); void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); void trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); void trace_latency_header(struct seq_file *m); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); void trace_graph_return(struct ftrace_graph_ret *trace); int trace_graph_entry(struct ftrace_graph_ent *trace); void set_graph_array(struct trace_array *tr); void tracing_start_cmdline_record(void); void tracing_stop_cmdline_record(void); void tracing_start_tgid_record(void); void tracing_stop_tgid_record(void); int register_tracer(struct tracer *type); int is_tracing_stopped(void); loff_t tracing_lseek(struct file *file, loff_t offset, int whence); extern cpumask_var_t __read_mostly tracing_buffer_mask; #define for_each_tracing_cpu(cpu) \ for_each_cpu(cpu, tracing_buffer_mask) extern unsigned long nsecs_to_usecs(unsigned long nsecs); extern unsigned long tracing_thresh; /* PID filtering */ extern int pid_max; bool trace_find_filtered_pid(struct trace_pid_list *filtered_pids, pid_t search_pid); bool trace_ignore_this_task(struct trace_pid_list *filtered_pids, struct trace_pid_list *filtered_no_pids, struct task_struct *task); void trace_filter_add_remove_task(struct trace_pid_list *pid_list, struct task_struct *self, struct task_struct *task); void *trace_pid_next(struct trace_pid_list *pid_list, void *v, loff_t *pos); void *trace_pid_start(struct trace_pid_list *pid_list, loff_t *pos); int trace_pid_show(struct seq_file *m, void *v); void trace_free_pid_list(struct trace_pid_list *pid_list); int trace_pid_write(struct trace_pid_list *filtered_pids, struct trace_pid_list **new_pid_list, const char __user *ubuf, size_t cnt); #ifdef CONFIG_TRACER_MAX_TRACE void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu, void *cond_data); void update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu); #endif /* CONFIG_TRACER_MAX_TRACE */ #if (defined(CONFIG_TRACER_MAX_TRACE) || defined(CONFIG_HWLAT_TRACER)) && \ defined(CONFIG_FSNOTIFY) void latency_fsnotify(struct trace_array *tr); #else static inline void latency_fsnotify(struct trace_array *tr) { } #endif #ifdef CONFIG_STACKTRACE void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else static inline void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { } #endif /* CONFIG_STACKTRACE */ extern u64 ftrace_now(int cpu); extern void trace_find_cmdline(int pid, char comm[]); extern int trace_find_tgid(int pid); extern void trace_event_follow_fork(struct trace_array *tr, bool enable); #ifdef CONFIG_DYNAMIC_FTRACE extern unsigned long ftrace_update_tot_cnt; extern unsigned long ftrace_number_of_pages; extern unsigned long ftrace_number_of_groups; void ftrace_init_trace_array(struct trace_array *tr); #else static inline void ftrace_init_trace_array(struct trace_array *tr) { } #endif #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; #ifdef CONFIG_FTRACE_STARTUP_TEST extern void __init disable_tracing_selftest(const char *reason); extern int trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_function_graph(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); /* * Tracer data references selftest functions that only occur * on boot up. These can be __init functions. Thus, when selftests * are enabled, then the tracers need to reference __init functions. */ #define __tracer_data __refdata #else static inline void __init disable_tracing_selftest(const char *reason) { } /* Tracers are seldom changed. Optimize when selftests are disabled. */ #define __tracer_data __read_mostly #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args); extern int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); extern char trace_find_mark(unsigned long long duration); struct ftrace_hash; struct ftrace_mod_load { struct list_head list; char *func; char *module; int enable; }; enum { FTRACE_HASH_FL_MOD = (1 << 0), }; struct ftrace_hash { unsigned long size_bits; struct hlist_head *buckets; unsigned long count; unsigned long flags; struct rcu_head rcu; }; struct ftrace_func_entry * ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip); static __always_inline bool ftrace_hash_empty(struct ftrace_hash *hash) { return !hash || !(hash->count || (hash->flags & FTRACE_HASH_FL_MOD)); } /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER /* Flag options */ #define TRACE_GRAPH_PRINT_OVERRUN 0x1 #define TRACE_GRAPH_PRINT_CPU 0x2 #define TRACE_GRAPH_PRINT_OVERHEAD 0x4 #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 #define TRACE_GRAPH_PRINT_REL_TIME 0x40 #define TRACE_GRAPH_PRINT_IRQS 0x80 #define TRACE_GRAPH_PRINT_TAIL 0x100 #define TRACE_GRAPH_SLEEP_TIME 0x200 #define TRACE_GRAPH_GRAPH_TIME 0x400 #define TRACE_GRAPH_PRINT_FILL_SHIFT 28 #define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT) extern void ftrace_graph_sleep_time_control(bool enable); #ifdef CONFIG_FUNCTION_PROFILER extern void ftrace_graph_graph_time_control(bool enable); #else static inline void ftrace_graph_graph_time_control(bool enable) { } #endif extern enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags); extern void print_graph_headers_flags(struct seq_file *s, u32 flags); extern void trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); extern void graph_trace_open(struct trace_iterator *iter); extern void graph_trace_close(struct trace_iterator *iter); extern int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, int pc); extern void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, int pc); #ifdef CONFIG_DYNAMIC_FTRACE extern struct ftrace_hash __rcu *ftrace_graph_hash; extern struct ftrace_hash __rcu *ftrace_graph_notrace_hash; static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { unsigned long addr = trace->func; int ret = 0; struct ftrace_hash *hash; preempt_disable_notrace(); /* * Have to open code "rcu_dereference_sched()" because the * function graph tracer can be called when RCU is not * "watching". * Protected with schedule_on_each_cpu(ftrace_sync) */ hash = rcu_dereference_protected(ftrace_graph_hash, !preemptible()); if (ftrace_hash_empty(hash)) { ret = 1; goto out; } if (ftrace_lookup_ip(hash, addr)) { /* * This needs to be cleared on the return functions * when the depth is zero. */ trace_recursion_set(TRACE_GRAPH_BIT); trace_recursion_set_depth(trace->depth); /* * If no irqs are to be traced, but a set_graph_function * is set, and called by an interrupt handler, we still * want to trace it. */ if (in_irq()) trace_recursion_set(TRACE_IRQ_BIT); else trace_recursion_clear(TRACE_IRQ_BIT); ret = 1; } out: preempt_enable_notrace(); return ret; } static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) { if (trace_recursion_test(TRACE_GRAPH_BIT) && trace->depth == trace_recursion_depth()) trace_recursion_clear(TRACE_GRAPH_BIT); } static inline int ftrace_graph_notrace_addr(unsigned long addr) { int ret = 0; struct ftrace_hash *notrace_hash; preempt_disable_notrace(); /* * Have to open code "rcu_dereference_sched()" because the * function graph tracer can be called when RCU is not * "watching". * Protected with schedule_on_each_cpu(ftrace_sync) */ notrace_hash = rcu_dereference_protected(ftrace_graph_notrace_hash, !preemptible()); if (ftrace_lookup_ip(notrace_hash, addr)) ret = 1; preempt_enable_notrace(); return ret; } #else static inline int ftrace_graph_addr(struct ftrace_graph_ent *trace) { return 1; } static inline int ftrace_graph_notrace_addr(unsigned long addr) { return 0; } static inline void ftrace_graph_addr_finish(struct ftrace_graph_ret *trace) { } #endif /* CONFIG_DYNAMIC_FTRACE */ extern unsigned int fgraph_max_depth; static inline bool ftrace_graph_ignore_func(struct ftrace_graph_ent *trace) { /* trace it when it is-nested-in or is a function enabled. */ return !(trace_recursion_test(TRACE_GRAPH_BIT) || ftrace_graph_addr(trace)) || (trace->depth < 0) || (fgraph_max_depth && trace->depth >= fgraph_max_depth); } #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function_flags(struct trace_iterator *iter, u32 flags) { return TRACE_TYPE_UNHANDLED; } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ extern struct list_head ftrace_pids; #ifdef CONFIG_FUNCTION_TRACER #define FTRACE_PID_IGNORE -1 #define FTRACE_PID_TRACE -2 struct ftrace_func_command { struct list_head list; char *name; int (*func)(struct trace_array *tr, struct ftrace_hash *hash, char *func, char *cmd, char *params, int enable); }; extern bool ftrace_filter_param __initdata; static inline int ftrace_trace_task(struct trace_array *tr) { return this_cpu_read(tr->array_buffer.data->ftrace_ignore_pid) != FTRACE_PID_IGNORE; } extern int ftrace_is_dead(void); int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent); void ftrace_destroy_function_files(struct trace_array *tr); int ftrace_allocate_ftrace_ops(struct trace_array *tr); void ftrace_free_ftrace_ops(struct trace_array *tr); void ftrace_init_global_array_ops(struct trace_array *tr); void ftrace_init_array_ops(struct trace_array *tr, ftrace_func_t func); void ftrace_reset_array_ops(struct trace_array *tr); void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer); void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d_tracer); void ftrace_clear_pids(struct trace_array *tr); int init_function_trace(void); void ftrace_pid_follow_fork(struct trace_array *tr, bool enable); #else static inline int ftrace_trace_task(struct trace_array *tr) { return 1; } static inline int ftrace_is_dead(void) { return 0; } static inline int ftrace_create_function_files(struct trace_array *tr, struct dentry *parent) { return 0; } static inline int ftrace_allocate_ftrace_ops(struct trace_array *tr) { return 0; } static inline void ftrace_free_ftrace_ops(struct trace_array *tr) { } static inline void ftrace_destroy_function_files(struct trace_array *tr) { } static inline __init void ftrace_init_global_array_ops(struct trace_array *tr) { } static inline void ftrace_reset_array_ops(struct trace_array *tr) { } static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { } static inline void ftrace_clear_pids(struct trace_array *tr) { } static inline int init_function_trace(void) { return 0; } static inline void ftrace_pid_follow_fork(struct trace_array *tr, bool enable) { } /* ftace_func_t type is not defined, use macro instead of static inline */ #define ftrace_init_array_ops(tr, func) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER */ #if defined(CONFIG_FUNCTION_TRACER) && defined(CONFIG_DYNAMIC_FTRACE) struct ftrace_probe_ops { void (*func)(unsigned long ip, unsigned long parent_ip, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data); int (*init)(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *init_data, void **data); void (*free)(struct ftrace_probe_ops *ops, struct trace_array *tr, unsigned long ip, void *data); int (*print)(struct seq_file *m, unsigned long ip, struct ftrace_probe_ops *ops, void *data); }; struct ftrace_func_mapper; typedef int (*ftrace_mapper_func)(void *data); struct ftrace_func_mapper *allocate_ftrace_func_mapper(void); void **ftrace_func_mapper_find_ip(struct ftrace_func_mapper *mapper, unsigned long ip); int ftrace_func_mapper_add_ip(struct ftrace_func_mapper *mapper, unsigned long ip, void *data); void *ftrace_func_mapper_remove_ip(struct ftrace_func_mapper *mapper, unsigned long ip); void free_ftrace_func_mapper(struct ftrace_func_mapper *mapper, ftrace_mapper_func free_func); extern int register_ftrace_function_probe(char *glob, struct trace_array *tr, struct ftrace_probe_ops *ops, void *data); extern int unregister_ftrace_function_probe_func(char *glob, struct trace_array *tr, struct ftrace_probe_ops *ops); extern void clear_ftrace_function_probes(struct trace_array *tr); int register_ftrace_command(struct ftrace_func_command *cmd); int unregister_ftrace_command(struct ftrace_func_command *cmd); void ftrace_create_filter_files(struct ftrace_ops *ops, struct dentry *parent); void ftrace_destroy_filter_files(struct ftrace_ops *ops); extern int ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); extern int ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf, int len, int reset); #else struct ftrace_func_command; static inline __init int register_ftrace_command(struct ftrace_func_command *cmd) { return -EINVAL; } static inline __init int unregister_ftrace_command(char *cmd_name) { return -EINVAL; } static inline void clear_ftrace_function_probes(struct trace_array *tr) { } /* * The ops parameter passed in is usually undefined. * This must be a macro. */ #define ftrace_create_filter_files(ops, parent) do { } while (0) #define ftrace_destroy_filter_files(ops) do { } while (0) #endif /* CONFIG_FUNCTION_TRACER && CONFIG_DYNAMIC_FTRACE */ bool ftrace_event_is_function(struct trace_event_call *call); /* * struct trace_parser - servers for reading the user input separated by spaces * @cont: set if the input is not complete - no final space char was found * @buffer: holds the parsed user input * @idx: user input length * @size: buffer size */ struct trace_parser { bool cont; char *buffer; unsigned idx; unsigned size; }; static inline bool trace_parser_loaded(struct trace_parser *parser) { return (parser->idx != 0); } static inline bool trace_parser_cont(struct trace_parser *parser) { return parser->cont; } static inline void trace_parser_clear(struct trace_parser *parser) { parser->cont = false; parser->idx = 0; } extern int trace_parser_get_init(struct trace_parser *parser, int size); extern void trace_parser_put(struct trace_parser *parser); extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf, size_t cnt, loff_t *ppos); /* * Only create function graph options if function graph is configured. */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER # define FGRAPH_FLAGS \ C(DISPLAY_GRAPH, "display-graph"), #else # define FGRAPH_FLAGS #endif #ifdef CONFIG_BRANCH_TRACER # define BRANCH_FLAGS \ C(BRANCH, "branch"), #else # define BRANCH_FLAGS #endif #ifdef CONFIG_FUNCTION_TRACER # define FUNCTION_FLAGS \ C(FUNCTION, "function-trace"), \ C(FUNC_FORK, "function-fork"), # define FUNCTION_DEFAULT_FLAGS TRACE_ITER_FUNCTION #else # define FUNCTION_FLAGS # define FUNCTION_DEFAULT_FLAGS 0UL # define TRACE_ITER_FUNC_FORK 0UL #endif #ifdef CONFIG_STACKTRACE # define STACK_FLAGS \ C(STACKTRACE, "stacktrace"), #else # define STACK_FLAGS #endif /* * trace_iterator_flags is an enumeration that defines bit * positions into trace_flags that controls the output. * * NOTE: These bits must match the trace_options array in * trace.c (this macro guarantees it). */ #define TRACE_FLAGS \ C(PRINT_PARENT, "print-parent"), \ C(SYM_OFFSET, "sym-offset"), \ C(SYM_ADDR, "sym-addr"), \ C(VERBOSE, "verbose"), \ C(RAW, "raw"), \ C(HEX, "hex"), \ C(BIN, "bin"), \ C(BLOCK, "block"), \ C(PRINTK, "trace_printk"), \ C(ANNOTATE, "annotate"), \ C(USERSTACKTRACE, "userstacktrace"), \ C(SYM_USEROBJ, "sym-userobj"), \ C(PRINTK_MSGONLY, "printk-msg-only"), \ C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \ C(LATENCY_FMT, "latency-format"), \ C(RECORD_CMD, "record-cmd"), \ C(RECORD_TGID, "record-tgid"), \ C(OVERWRITE, "overwrite"), \ C(STOP_ON_FREE, "disable_on_free"), \ C(IRQ_INFO, "irq-info"), \ C(MARKERS, "markers"), \ C(EVENT_FORK, "event-fork"), \ C(PAUSE_ON_TRACE, "pause-on-trace"), \ FUNCTION_FLAGS \ FGRAPH_FLAGS \ STACK_FLAGS \ BRANCH_FLAGS /* * By defining C, we can make TRACE_FLAGS a list of bit names * that will define the bits for the flag masks. */ #undef C #define C(a, b) TRACE_ITER_##a##_BIT enum trace_iterator_bits { TRACE_FLAGS /* Make sure we don't go more than we have bits for */ TRACE_ITER_LAST_BIT }; /* * By redefining C, we can make TRACE_FLAGS a list of masks that * use the bits as defined above. */ #undef C #define C(a, b) TRACE_ITER_##a = (1 << TRACE_ITER_##a##_BIT) enum trace_iterator_flags { TRACE_FLAGS }; /* * TRACE_ITER_SYM_MASK masks the options in trace_flags that * control the output of kernel symbols. */ #define TRACE_ITER_SYM_MASK \ (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR) extern struct tracer nop_trace; #ifdef CONFIG_BRANCH_TRACER extern int enable_branch_tracing(struct trace_array *tr); extern void disable_branch_tracing(void); static inline int trace_branch_enable(struct trace_array *tr) { if (tr->trace_flags & TRACE_ITER_BRANCH) return enable_branch_tracing(tr); return 0; } static inline void trace_branch_disable(void) { /* due to races, always disable */ disable_branch_tracing(); } #else static inline int trace_branch_enable(struct trace_array *tr) { return 0; } static inline void trace_branch_disable(void) { } #endif /* CONFIG_BRANCH_TRACER */ /* set ring buffers to default size if not already done so */ int tracing_update_buffers(void); struct ftrace_event_field { struct list_head link; const char *name; const char *type; int filter_type; int offset; int size; int is_signed; }; struct prog_entry; struct event_filter { struct prog_entry __rcu *prog; char *filter_string; }; struct event_subsystem { struct list_head list; const char *name; struct event_filter *filter; int ref_count; }; struct trace_subsystem_dir { struct list_head list; struct event_subsystem *subsystem; struct trace_array *tr; struct dentry *entry; int ref_count; int nr_events; }; extern int call_filter_check_discard(struct trace_event_call *call, void *rec, struct trace_buffer *buffer, struct ring_buffer_event *event); void trace_buffer_unlock_commit_regs(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc, struct pt_regs *regs); static inline void trace_buffer_unlock_commit(struct trace_array *tr, struct trace_buffer *buffer, struct ring_buffer_event *event, unsigned long flags, int pc) { trace_buffer_unlock_commit_regs(tr, buffer, event, flags, pc, NULL); } DECLARE_PER_CPU(struct ring_buffer_event *, trace_buffered_event); DECLARE_PER_CPU(int, trace_buffered_event_cnt); void trace_buffered_event_disable(void); void trace_buffered_event_enable(void); static inline void __trace_event_discard_commit(struct trace_buffer *buffer, struct ring_buffer_event *event) { if (this_cpu_read(trace_buffered_event) == event) { /* Simply release the temp buffer */ this_cpu_dec(trace_buffered_event_cnt); return; } ring_buffer_discard_commit(buffer, event); } /* * Helper function for event_trigger_unlock_commit{_regs}(). * If there are event triggers attached to this event that requires * filtering against its fields, then they will be called as the * entry already holds the field information of the current event. * * It also checks if the event should be discarded or not. * It is to be discarded if the event is soft disabled and the * event was only recorded to process triggers, or if the event * filter is active and this event did not match the filters. * * Returns true if the event is discarded, false otherwise. */ static inline bool __event_trigger_test_discard(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, enum event_trigger_type *tt) { unsigned long eflags = file->flags; if (eflags & EVENT_FILE_FL_TRIGGER_COND) *tt = event_triggers_call(file, entry, event); if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED | EVENT_FILE_FL_FILTERED | EVENT_FILE_FL_PID_FILTER)))) return false; if (file->flags & EVENT_FILE_FL_SOFT_DISABLED) goto discard; if (file->flags & EVENT_FILE_FL_FILTERED && !filter_match_preds(file->filter, entry)) goto discard; if ((file->flags & EVENT_FILE_FL_PID_FILTER) && trace_event_ignore_this_pid(file)) goto discard; return false; discard: __trace_event_discard_commit(buffer, event); return true; } /** * event_trigger_unlock_commit - handle triggers and finish event commit * @file: The file pointer assoctiated to the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself * @irq_flags: The state of the interrupts at the start of the event * @pc: The state of the preempt count at the start of the event. * * This is a helper function to handle triggers that require data * from the event itself. It also tests the event against filters and * if the event is soft disabled and should be discarded. */ static inline void event_trigger_unlock_commit(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc) { enum event_trigger_type tt = ETT_NONE; if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) trace_buffer_unlock_commit(file->tr, buffer, event, irq_flags, pc); if (tt) event_triggers_post_call(file, tt); } /** * event_trigger_unlock_commit_regs - handle triggers and finish event commit * @file: The file pointer assoctiated to the event * @buffer: The ring buffer that the event is being written to * @event: The event meta data in the ring buffer * @entry: The event itself * @irq_flags: The state of the interrupts at the start of the event * @pc: The state of the preempt count at the start of the event. * * This is a helper function to handle triggers that require data * from the event itself. It also tests the event against filters and * if the event is soft disabled and should be discarded. * * Same as event_trigger_unlock_commit() but calls * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit(). */ static inline void event_trigger_unlock_commit_regs(struct trace_event_file *file, struct trace_buffer *buffer, struct ring_buffer_event *event, void *entry, unsigned long irq_flags, int pc, struct pt_regs *regs) { enum event_trigger_type tt = ETT_NONE; if (!__event_trigger_test_discard(file, buffer, event, entry, &tt)) trace_buffer_unlock_commit_regs(file->tr, buffer, event, irq_flags, pc, regs); if (tt) event_triggers_post_call(file, tt); } #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) /* * The max preds is the size of unsigned short with * two flags at the MSBs. One bit is used for both the IS_RIGHT * and FOLD flags. The other is reserved. * * 2^14 preds is way more than enough. */ #define MAX_FILTER_PRED 16384 struct filter_pred; struct regex; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); typedef int (*regex_match_func)(char *str, struct regex *r, int len); enum regex_type { MATCH_FULL = 0, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY, MATCH_GLOB, MATCH_INDEX, }; struct regex { char pattern[MAX_FILTER_STR_VAL]; int len; int field_len; regex_match_func match; }; struct filter_pred { filter_pred_fn_t fn; u64 val; struct regex regex; unsigned short *ops; struct ftrace_event_field *field; int offset; int not; int op; }; static inline bool is_string_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_DYN_STRING || field->filter_type == FILTER_STATIC_STRING || field->filter_type == FILTER_PTR_STRING || field->filter_type == FILTER_COMM; } static inline bool is_function_field(struct ftrace_event_field *field) { return field->filter_type == FILTER_TRACE_FN; } extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct trace_event_file *file, struct trace_seq *s); extern int apply_event_filter(struct trace_event_file *file, char *filter_string); extern int apply_subsystem_event_filter(struct trace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); extern int create_event_filter(struct trace_array *tr, struct trace_event_call *call, char *filter_str, bool set_str, struct event_filter **filterp); extern void free_event_filter(struct event_filter *filter); struct ftrace_event_field * trace_find_event_field(struct trace_event_call *call, char *name); extern void trace_event_enable_cmd_record(bool enable); extern void trace_event_enable_tgid_record(bool enable); extern int event_trace_init(void); extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); extern int event_trace_del_tracer(struct trace_array *tr); extern void __trace_early_add_events(struct trace_array *tr); extern struct trace_event_file *__find_event_file(struct trace_array *tr, const char *system, const char *event); extern struct trace_event_file *find_event_file(struct trace_array *tr, const char *system, const char *event); static inline void *event_file_data(struct file *filp) { return READ_ONCE(file_inode(filp)->i_private); } extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const struct file_operations event_trigger_fops; extern const struct file_operations event_hist_fops; extern const struct file_operations event_hist_debug_fops; extern const struct file_operations event_inject_fops; #ifdef CONFIG_HIST_TRIGGERS extern int register_trigger_hist_cmd(void); extern int register_trigger_hist_enable_disable_cmds(void); #else static inline int register_trigger_hist_cmd(void) { return 0; } static inline int register_trigger_hist_enable_disable_cmds(void) { return 0; } #endif extern int register_trigger_cmds(void); extern void clear_event_triggers(struct trace_array *tr); struct event_trigger_data { unsigned long count; int ref; struct event_trigger_ops *ops; struct event_command *cmd_ops; struct event_filter __rcu *filter; char *filter_str; void *private_data; bool paused; bool paused_tmp; struct list_head list; char *name; struct list_head named_list; struct event_trigger_data *named_data; }; /* Avoid typos */ #define ENABLE_EVENT_STR "enable_event" #define DISABLE_EVENT_STR "disable_event" #define ENABLE_HIST_STR "enable_hist" #define DISABLE_HIST_STR "disable_hist" struct enable_trigger_data { struct trace_event_file *file; bool enable; bool hist; }; extern int event_enable_trigger_print(struct seq_file *m, struct event_trigger_ops *ops, struct event_trigger_data *data); extern void event_enable_trigger_free(struct event_trigger_ops *ops, struct event_trigger_data *data); extern int event_enable_trigger_func(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *param); extern int event_enable_register_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); extern void event_enable_unregister_trigger(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *test, struct trace_event_file *file); extern void trigger_data_free(struct event_trigger_data *data); extern int event_trigger_init(struct event_trigger_ops *ops, struct event_trigger_data *data); extern int trace_event_trigger_enable_disable(struct trace_event_file *file, int trigger_enable); extern void update_cond_flag(struct trace_event_file *file); extern int set_trigger_filter(char *filter_str, struct event_trigger_data *trigger_data, struct trace_event_file *file); extern struct event_trigger_data *find_named_trigger(const char *name); extern bool is_named_trigger(struct event_trigger_data *test); extern int save_named_trigger(const char *name, struct event_trigger_data *data); extern void del_named_trigger(struct event_trigger_data *data); extern void pause_named_trigger(struct event_trigger_data *data); extern void unpause_named_trigger(struct event_trigger_data *data); extern void set_named_trigger_data(struct event_trigger_data *data, struct event_trigger_data *named_data); extern struct event_trigger_data * get_named_trigger_data(struct event_trigger_data *data); extern int register_event_command(struct event_command *cmd); extern int unregister_event_command(struct event_command *cmd); extern int register_trigger_hist_enable_disable_cmds(void); /** * struct event_trigger_ops - callbacks for trace event triggers * * The methods in this structure provide per-event trigger hooks for * various trigger operations. * * All the methods below, except for @init() and @free(), must be * implemented. * * @func: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that * registered the trigger (see struct event_command) along with * the trace record, rec. * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() * function). This can be used to perform per-trigger * initialization such as incrementing a per-trigger reference * count, for instance. This is usually implemented by the * generic utility function @event_trigger_init() (see * trace_event_triggers.c). * * @free: An optional de-initialization function called for the * trigger when the trigger is unregistered (via the * event_command @reg() function). This can be used to perform * per-trigger de-initialization such as decrementing a * per-trigger reference count and freeing corresponding trigger * data, for instance. This is usually implemented by the * generic utility function @event_trigger_free() (see * trace_event_triggers.c). * * @print: The callback function invoked to have the trigger print * itself. This is usually implemented by a wrapper function * that calls the generic utility function @event_trigger_print() * (see trace_event_triggers.c). */ struct event_trigger_ops { void (*func)(struct event_trigger_data *data, void *rec, struct ring_buffer_event *rbe); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, struct event_trigger_data *data); int (*print)(struct seq_file *m, struct event_trigger_ops *ops, struct event_trigger_data *data); }; /** * struct event_command - callbacks and data members for event commands * * Event commands are invoked by users by writing the command name * into the 'trigger' file associated with a trace event. The * parameters associated with a specific invocation of an event * command are used to create an event trigger instance, which is * added to the list of trigger instances associated with that trace * event. When the event is hit, the set of triggers associated with * that event is invoked. * * The data members in this structure provide per-event command data * for various event commands. * * All the data members below, except for @post_trigger, must be set * for each event command. * * @name: The unique name that identifies the event command. This is * the name used when setting triggers via trigger files. * * @trigger_type: A unique id that identifies the event command * 'type'. This value has two purposes, the first to ensure that * only one trigger of the same type can be set at a given time * for a particular event e.g. it doesn't make sense to have both * a traceon and traceoff trigger attached to a single event at * the same time, so traceon and traceoff have the same type * though they have different names. The @trigger_type value is * also used as a bit value for deferring the actual trigger * action until after the current event is finished. Some * commands need to do this if they themselves log to the trace * buffer (see the @post_trigger() member below). @trigger_type * values are defined by adding new values to the trigger_type * enum in include/linux/trace_events.h. * * @flags: See the enum event_command_flags below. * * All the methods below, except for @set_filter() and @unreg_all(), * must be implemented. * * @func: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the * user. It allocates the trigger instance and registers it with * the appropriate trace event. It makes use of the other * event_command callback functions to orchestrate this, and is * usually implemented by the generic utility function * @event_trigger_callback() (see trace_event_triggers.c). * * @reg: Adds the trigger to the list of triggers associated with the * event, and enables the event trigger itself, after * initializing it (via the event_trigger_ops @init() function). * This is also where commands can use the @trigger_type value to * make the decision as to whether or not multiple instances of * the trigger should be allowed. This is usually implemented by * the generic utility function @register_trigger() (see * trace_event_triggers.c). * * @unreg: Removes the trigger from the list of triggers associated * with the event, and disables the event trigger itself, after * initializing it (via the event_trigger_ops @free() function). * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * * @unreg_all: An optional function called to remove all the triggers * from the list of triggers associated with the event. Called * when a trigger file is opened in truncate mode. * * @set_filter: An optional function called to parse and set a filter * for the trigger. If no @set_filter() method is set for the * event command, filters set by the user for the command will be * ignored. This is usually implemented by the generic utility * function @set_trigger_filter() (see trace_event_triggers.c). * * @get_trigger_ops: The callback function invoked to retrieve the * event_trigger_ops implementation associated with the command. */ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; int flags; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); int (*reg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg)(char *glob, struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); void (*unreg_all)(struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; /** * enum event_command_flags - flags for struct event_command * * @POST_TRIGGER: A flag that says whether or not this command needs * to have its action delayed until after the current event has * been closed. Some triggers need to avoid being invoked while * an event is currently in the process of being logged, since * the trigger may itself log data into the trace buffer. Thus * we make sure the current event is committed before invoking * those triggers. To do that, the trigger invocation is split * in two - the first part checks the filter using the current * trace record; if a command has the @post_trigger flag set, it * sets a bit for itself in the return value, otherwise it * directly invokes the trigger. Once all commands have been * either invoked or set their return flag, the current record is * either committed or discarded. At that point, if any commands * have deferred their triggers, those commands are finally * invoked following the close of the current event. In other * words, if the event_trigger_ops @func() probe implementation * itself logs to the trace buffer, this flag should be set, * otherwise it can be left unspecified. * * @NEEDS_REC: A flag that says whether or not this command needs * access to the trace record in order to perform its function, * regardless of whether or not it has a filter associated with * it (filters make a trigger require access to the trace record * but are not always present). */ enum event_command_flags { EVENT_CMD_FL_POST_TRIGGER = 1, EVENT_CMD_FL_NEEDS_REC = 2, }; static inline bool event_command_post_trigger(struct event_command *cmd_ops) { return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; } static inline bool event_command_needs_rec(struct event_command *cmd_ops) { return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; } extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); extern void tracing_snapshot_cond(struct trace_array *tr, void *cond_data); extern int tracing_snapshot_cond_enable(struct trace_array *tr, void *cond_data, cond_update_fn_t update); extern int tracing_snapshot_cond_disable(struct trace_array *tr); extern void *tracing_cond_snapshot_data(struct trace_array *tr); extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; extern const char *__start___tracepoint_str[]; extern const char *__stop___tracepoint_str[]; void trace_printk_control(bool enabled); void trace_printk_start_comm(void); int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); /* Used from boot time tracer */ extern int trace_set_options(struct trace_array *tr, char *option); extern int tracing_set_tracer(struct trace_array *tr, const char *buf); extern ssize_t tracing_resize_ring_buffer(struct trace_array *tr, unsigned long size, int cpu_id); extern int tracing_set_cpumask(struct trace_array *tr, cpumask_var_t tracing_cpumask_new); #define MAX_EVENT_NAME_LEN 64 extern int trace_run_command(const char *buf, int (*createfn)(int, char**)); extern ssize_t trace_parse_run_command(struct file *file, const char __user *buffer, size_t count, loff_t *ppos, int (*createfn)(int, char**)); extern unsigned int err_pos(char *cmd, const char *str); extern void tracing_log_err(struct trace_array *tr, const char *loc, const char *cmd, const char **errs, u8 type, u8 pos); /* * Normal trace_printk() and friends allocates special buffers * to do the manipulation, as well as saves the print formats * into sections to display. But the trace infrastructure wants * to use these without the added overhead at the price of being * a bit slower (used mainly for warnings, where we don't care * about performance). The internal_trace_puts() is for such * a purpose. */ #define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ extern struct trace_event_call \ __aligned(4) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #undef FTRACE_ENTRY_PACKED #define FTRACE_ENTRY_PACKED(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) #include "trace_entries.h" #if defined(CONFIG_PERF_EVENTS) && defined(CONFIG_FUNCTION_TRACER) int perf_ftrace_event_register(struct trace_event_call *call, enum trace_reg type, void *data); #else #define perf_ftrace_event_register NULL #endif #ifdef CONFIG_FTRACE_SYSCALLS void init_ftrace_syscalls(void); const char *get_syscall_name(int syscall); #else static inline void init_ftrace_syscalls(void) { } static inline const char *get_syscall_name(int syscall) { return NULL; } #endif #ifdef CONFIG_EVENT_TRACING void trace_event_init(void); void trace_event_eval_update(struct trace_eval_map **map, int len); /* Used from boot time tracer */ extern int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set); extern int trigger_process_regex(struct trace_event_file *file, char *buff); #else static inline void __init trace_event_init(void) { } static inline void trace_event_eval_update(struct trace_eval_map **map, int len) { } #endif #ifdef CONFIG_TRACER_SNAPSHOT void tracing_snapshot_instance(struct trace_array *tr); int tracing_alloc_snapshot_instance(struct trace_array *tr); #else static inline void tracing_snapshot_instance(struct trace_array *tr) { } static inline int tracing_alloc_snapshot_instance(struct trace_array *tr) { return 0; } #endif #ifdef CONFIG_PREEMPT_TRACER void tracer_preempt_on(unsigned long a0, unsigned long a1); void tracer_preempt_off(unsigned long a0, unsigned long a1); #else static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } #endif #ifdef CONFIG_IRQSOFF_TRACER void tracer_hardirqs_on(unsigned long a0, unsigned long a1); void tracer_hardirqs_off(unsigned long a0, unsigned long a1); #else static inline void tracer_hardirqs_on(unsigned long a0, unsigned long a1) { } static inline void tracer_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif extern struct trace_iterator *tracepoint_print_iter; /* * Reset the state of the trace_iterator so that it can read consumed data. * Normally, the trace_iterator is used for reading the data when it is not * consumed, and must retain state. */ static __always_inline void trace_iterator_reset(struct trace_iterator *iter) { const size_t offset = offsetof(struct trace_iterator, seq); /* * Keep gcc from complaining about overwriting more than just one * member in the structure. */ memset((char *)iter + offset, 0, sizeof(struct trace_iterator) - offset); iter->pos = -1; } /* Check the name is good for event/group/fields */ static inline bool is_good_name(const char *name) { if (!isalpha(*name) && *name != '_') return false; while (*++name != '\0') { if (!isalpha(*name) && !isdigit(*name) && *name != '_') return false; } return true; } #endif /* _LINUX_KERNEL_TRACE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM ext4 #if !defined(_TRACE_EXT4_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_EXT4_H #include <linux/writeback.h> #include <linux/tracepoint.h> struct ext4_allocation_context; struct ext4_allocation_request; struct ext4_extent; struct ext4_prealloc_space; struct ext4_inode_info; struct mpage_da_data; struct ext4_map_blocks; struct extent_status; struct ext4_fsmap; struct partial_cluster; #define EXT4_I(inode) (container_of(inode, struct ext4_inode_info, vfs_inode)) #define show_mballoc_flags(flags) __print_flags(flags, "|", \ { EXT4_MB_HINT_MERGE, "HINT_MERGE" }, \ { EXT4_MB_HINT_RESERVED, "HINT_RESV" }, \ { EXT4_MB_HINT_METADATA, "HINT_MDATA" }, \ { EXT4_MB_HINT_FIRST, "HINT_FIRST" }, \ { EXT4_MB_HINT_BEST, "HINT_BEST" }, \ { EXT4_MB_HINT_DATA, "HINT_DATA" }, \ { EXT4_MB_HINT_NOPREALLOC, "HINT_NOPREALLOC" }, \ { EXT4_MB_HINT_GROUP_ALLOC, "HINT_GRP_ALLOC" }, \ { EXT4_MB_HINT_GOAL_ONLY, "HINT_GOAL_ONLY" }, \ { EXT4_MB_HINT_TRY_GOAL, "HINT_TRY_GOAL" }, \ { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ { EXT4_MB_USE_RESERVED, "USE_RESV" }, \ { EXT4_MB_STRICT_CHECK, "STRICT_CHECK" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ { EXT4_GET_BLOCKS_UNWRIT_EXT, "UNWRIT" }, \ { EXT4_GET_BLOCKS_DELALLOC_RESERVE, "DELALLOC" }, \ { EXT4_GET_BLOCKS_PRE_IO, "PRE_IO" }, \ { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \ { EXT4_GET_BLOCKS_ZERO, "ZERO" }, \ { EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \ { EXT4_EX_NOCACHE, "EX_NOCACHE" }) /* * __print_flags() requires that all enum values be wrapped in the * TRACE_DEFINE_ENUM macro so that the enum value can be encoded in the ftrace * ring buffer. */ TRACE_DEFINE_ENUM(BH_New); TRACE_DEFINE_ENUM(BH_Mapped); TRACE_DEFINE_ENUM(BH_Unwritten); TRACE_DEFINE_ENUM(BH_Boundary); #define show_mflags(flags) __print_flags(flags, "", \ { EXT4_MAP_NEW, "N" }, \ { EXT4_MAP_MAPPED, "M" }, \ { EXT4_MAP_UNWRITTEN, "U" }, \ { EXT4_MAP_BOUNDARY, "B" }) #define show_free_flags(flags) __print_flags(flags, "|", \ { EXT4_FREE_BLOCKS_METADATA, "METADATA" }, \ { EXT4_FREE_BLOCKS_FORGET, "FORGET" }, \ { EXT4_FREE_BLOCKS_VALIDATED, "VALIDATED" }, \ { EXT4_FREE_BLOCKS_NO_QUOT_UPDATE, "NO_QUOTA" }, \ { EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER,"1ST_CLUSTER" },\ { EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER, "LAST_CLUSTER" }) TRACE_DEFINE_ENUM(ES_WRITTEN_B); TRACE_DEFINE_ENUM(ES_UNWRITTEN_B); TRACE_DEFINE_ENUM(ES_DELAYED_B); TRACE_DEFINE_ENUM(ES_HOLE_B); TRACE_DEFINE_ENUM(ES_REFERENCED_B); #define show_extent_status(status) __print_flags(status, "", \ { EXTENT_STATUS_WRITTEN, "W" }, \ { EXTENT_STATUS_UNWRITTEN, "U" }, \ { EXTENT_STATUS_DELAYED, "D" }, \ { EXTENT_STATUS_HOLE, "H" }, \ { EXTENT_STATUS_REFERENCED, "R" }) #define show_falloc_mode(mode) __print_flags(mode, "|", \ { FALLOC_FL_KEEP_SIZE, "KEEP_SIZE"}, \ { FALLOC_FL_PUNCH_HOLE, "PUNCH_HOLE"}, \ { FALLOC_FL_NO_HIDE_STALE, "NO_HIDE_STALE"}, \ { FALLOC_FL_COLLAPSE_RANGE, "COLLAPSE_RANGE"}, \ { FALLOC_FL_ZERO_RANGE, "ZERO_RANGE"}) #define show_fc_reason(reason) \ __print_symbolic(reason, \ { EXT4_FC_REASON_XATTR, "XATTR"}, \ { EXT4_FC_REASON_CROSS_RENAME, "CROSS_RENAME"}, \ { EXT4_FC_REASON_JOURNAL_FLAG_CHANGE, "JOURNAL_FLAG_CHANGE"}, \ { EXT4_FC_REASON_NOMEM, "NO_MEM"}, \ { EXT4_FC_REASON_SWAP_BOOT, "SWAP_BOOT"}, \ { EXT4_FC_REASON_RESIZE, "RESIZE"}, \ { EXT4_FC_REASON_RENAME_DIR, "RENAME_DIR"}, \ { EXT4_FC_REASON_FALLOC_RANGE, "FALLOC_RANGE"}, \ { EXT4_FC_REASON_INODE_JOURNAL_DATA, "INODE_JOURNAL_DATA"}) TRACE_EVENT(ext4_other_inode_update_time, TP_PROTO(struct inode *inode, ino_t orig_ino), TP_ARGS(inode, orig_ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, orig_ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u16, mode ) ), TP_fast_assign( __entry->orig_ino = orig_ino; __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->orig_ino, (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid) ); TRACE_EVENT(ext4_free_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( uid_t, uid ) __field( gid_t, gid ) __field( __u64, blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->uid = i_uid_read(inode); __entry->gid = i_gid_read(inode); __entry->blocks = inode->i_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o uid %u gid %u blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->uid, __entry->gid, __entry->blocks) ); TRACE_EVENT(ext4_request_inode, TP_PROTO(struct inode *dir, int mode), TP_ARGS(dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_allocate_inode, TP_PROTO(struct inode *inode, struct inode *dir, int mode), TP_ARGS(inode, dir, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, dir ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->dir = dir->i_ino; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu dir %lu mode 0%o", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->dir, __entry->mode) ); TRACE_EVENT(ext4_evict_inode, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, nlink ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nlink = inode->i_nlink; ), TP_printk("dev %d,%d ino %lu nlink %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nlink) ); TRACE_EVENT(ext4_drop_inode, TP_PROTO(struct inode *inode, int drop), TP_ARGS(inode, drop), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, drop ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->drop = drop; ), TP_printk("dev %d,%d ino %lu drop %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->drop) ); TRACE_EVENT(ext4_nfs_commit_metadata, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; ), TP_printk("dev %d,%d ino %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_mark_inode_dirty, TP_PROTO(struct inode *inode, unsigned long IP), TP_ARGS(inode, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field(unsigned long, ip ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ip = IP; ), TP_printk("dev %d,%d ino %lu caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (void *)__entry->ip) ); TRACE_EVENT(ext4_begin_ordered_truncate, TP_PROTO(struct inode *inode, loff_t new_size), TP_ARGS(inode, new_size), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, new_size ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->new_size = new_size; ), TP_printk("dev %d,%d ino %lu new_size %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->new_size) ); DECLARE_EVENT_CLASS(ext4__write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int flags), TP_ARGS(inode, pos, len, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->flags = flags; ), TP_printk("dev %d,%d ino %lu pos %lld len %u flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->flags) ); DEFINE_EVENT(ext4__write_begin, ext4_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int flags), TP_ARGS(inode, pos, len, flags) ); DEFINE_EVENT(ext4__write_begin, ext4_da_write_begin, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int flags), TP_ARGS(inode, pos, len, flags) ); DECLARE_EVENT_CLASS(ext4__write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, len ) __field( unsigned int, copied ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = pos; __entry->len = len; __entry->copied = copied; ), TP_printk("dev %d,%d ino %lu pos %lld len %u copied %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->copied) ); DEFINE_EVENT(ext4__write_end, ext4_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_journalled_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); DEFINE_EVENT(ext4__write_end, ext4_da_write_end, TP_PROTO(struct inode *inode, loff_t pos, unsigned int len, unsigned int copied), TP_ARGS(inode, pos, len, copied) ); TRACE_EVENT(ext4_writepages, TP_PROTO(struct inode *inode, struct writeback_control *wbc), TP_ARGS(inode, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( long, nr_to_write ) __field( long, pages_skipped ) __field( loff_t, range_start ) __field( loff_t, range_end ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) __field( char, for_kupdate ) __field( char, range_cyclic ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->nr_to_write = wbc->nr_to_write; __entry->pages_skipped = wbc->pages_skipped; __entry->range_start = wbc->range_start; __entry->range_end = wbc->range_end; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; __entry->for_kupdate = wbc->for_kupdate; __entry->range_cyclic = wbc->range_cyclic; ), TP_printk("dev %d,%d ino %lu nr_to_write %ld pages_skipped %ld " "range_start %lld range_end %lld sync_mode %d " "for_kupdate %d range_cyclic %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->nr_to_write, __entry->pages_skipped, __entry->range_start, __entry->range_end, __entry->sync_mode, __entry->for_kupdate, __entry->range_cyclic, (unsigned long) __entry->writeback_index) ); TRACE_EVENT(ext4_da_write_pages, TP_PROTO(struct inode *inode, pgoff_t first_page, struct writeback_control *wbc), TP_ARGS(inode, first_page, wbc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, first_page ) __field( long, nr_to_write ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->first_page = first_page; __entry->nr_to_write = wbc->nr_to_write; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu first_page %lu nr_to_write %ld " "sync_mode %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->first_page, __entry->nr_to_write, __entry->sync_mode) ); TRACE_EVENT(ext4_da_write_pages_extent, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map), TP_ARGS(inode, map), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, lblk ) __field( __u32, len ) __field( __u32, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->flags = map->m_flags; ), TP_printk("dev %d,%d ino %lu lblk %llu len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_mflags(__entry->flags)) ); TRACE_EVENT(ext4_writepages_result, TP_PROTO(struct inode *inode, struct writeback_control *wbc, int ret, int pages_written), TP_ARGS(inode, wbc, ret, pages_written), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) __field( int, pages_written ) __field( long, pages_skipped ) __field( pgoff_t, writeback_index ) __field( int, sync_mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; __entry->pages_written = pages_written; __entry->pages_skipped = wbc->pages_skipped; __entry->writeback_index = inode->i_mapping->writeback_index; __entry->sync_mode = wbc->sync_mode; ), TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld " "sync_mode %d writeback_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret, __entry->pages_written, __entry->pages_skipped, __entry->sync_mode, (unsigned long) __entry->writeback_index) ); DECLARE_EVENT_CLASS(ext4__page_op, TP_PROTO(struct page *page), TP_ARGS(page), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) ), TP_fast_assign( __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; ), TP_printk("dev %d,%d ino %lu page_index %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index) ); DEFINE_EVENT(ext4__page_op, ext4_writepage, TP_PROTO(struct page *page), TP_ARGS(page) ); DEFINE_EVENT(ext4__page_op, ext4_readpage, TP_PROTO(struct page *page), TP_ARGS(page) ); DEFINE_EVENT(ext4__page_op, ext4_releasepage, TP_PROTO(struct page *page), TP_ARGS(page) ); DECLARE_EVENT_CLASS(ext4_invalidatepage_op, TP_PROTO(struct page *page, unsigned int offset, unsigned int length), TP_ARGS(page, offset, length), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( pgoff_t, index ) __field( unsigned int, offset ) __field( unsigned int, length ) ), TP_fast_assign( __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; __entry->offset = offset; __entry->length = length; ), TP_printk("dev %d,%d ino %lu page_index %lu offset %u length %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->index, __entry->offset, __entry->length) ); DEFINE_EVENT(ext4_invalidatepage_op, ext4_invalidatepage, TP_PROTO(struct page *page, unsigned int offset, unsigned int length), TP_ARGS(page, offset, length) ); DEFINE_EVENT(ext4_invalidatepage_op, ext4_journalled_invalidatepage, TP_PROTO(struct page *page, unsigned int offset, unsigned int length), TP_ARGS(page, offset, length) ); TRACE_EVENT(ext4_discard_blocks, TP_PROTO(struct super_block *sb, unsigned long long blk, unsigned long long count), TP_ARGS(sb, blk, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, blk ) __field( __u64, count ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->blk = blk; __entry->count = count; ), TP_printk("dev %d,%d blk %llu count %llu", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blk, __entry->count) ); DECLARE_EVENT_CLASS(ext4__mb_new_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, pa_pstart ) __field( __u64, pa_lstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = ac->ac_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->pa_pstart = pa->pa_pstart; __entry->pa_lstart = pa->pa_lstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d ino %lu pstart %llu len %u lstart %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pa_pstart, __entry->pa_len, __entry->pa_lstart) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_inode_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, TP_PROTO(struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), TP_ARGS(ac, pa) ); TRACE_EVENT(ext4_mb_release_inode_pa, TP_PROTO(struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), TP_ARGS(pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( __u32, count ) ), TP_fast_assign( __entry->dev = pa->pa_inode->i_sb->s_dev; __entry->ino = pa->pa_inode->i_ino; __entry->block = block; __entry->count = count; ), TP_printk("dev %d,%d ino %lu block %llu count %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->block, __entry->count) ); TRACE_EVENT(ext4_mb_release_group_pa, TP_PROTO(struct super_block *sb, struct ext4_prealloc_space *pa), TP_ARGS(sb, pa), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u64, pa_pstart ) __field( __u32, pa_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), TP_printk("dev %d,%d pstart %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->pa_pstart, __entry->pa_len) ); TRACE_EVENT(ext4_discard_preallocations, TP_PROTO(struct inode *inode, unsigned int len, unsigned int needed), TP_ARGS(inode, len, needed), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) __field( unsigned int, needed ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->len = len; __entry->needed = needed; ), TP_printk("dev %d,%d ino %lu len: %u needed %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->len, __entry->needed) ); TRACE_EVENT(ext4_mb_discard_preallocations, TP_PROTO(struct super_block *sb, int needed), TP_ARGS(sb, needed), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, needed ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->needed = needed; ), TP_printk("dev %d,%d needed %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->needed) ); TRACE_EVENT(ext4_request_blocks, TP_PROTO(struct ext4_allocation_request *ar), TP_ARGS(ar), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u lblk %u goal %llu " "lleft %u lright %u pleft %llu pright %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_allocate_blocks, TP_PROTO(struct ext4_allocation_request *ar, unsigned long long block), TP_ARGS(ar, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned int, len ) __field( __u32, logical ) __field( __u32, lleft ) __field( __u32, lright ) __field( __u64, goal ) __field( __u64, pleft ) __field( __u64, pright ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = ar->inode->i_sb->s_dev; __entry->ino = ar->inode->i_ino; __entry->block = block; __entry->len = ar->len; __entry->logical = ar->logical; __entry->goal = ar->goal; __entry->lleft = ar->lleft; __entry->lright = ar->lright; __entry->pleft = ar->pleft; __entry->pright = ar->pright; __entry->flags = ar->flags; ), TP_printk("dev %d,%d ino %lu flags %s len %u block %llu lblk %u " "goal %llu lleft %u lright %u pleft %llu pright %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_mballoc_flags(__entry->flags), __entry->len, __entry->block, __entry->logical, __entry->goal, __entry->lleft, __entry->lright, __entry->pleft, __entry->pright) ); TRACE_EVENT(ext4_free_blocks, TP_PROTO(struct inode *inode, __u64 block, unsigned long count, int flags), TP_ARGS(inode, block, count, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( unsigned long, count ) __field( int, flags ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->count = count; __entry->flags = flags; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o block %llu count %lu flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->block, __entry->count, show_free_flags(__entry->flags)) ); TRACE_EVENT(ext4_sync_file_enter, TP_PROTO(struct file *file, int datasync), TP_ARGS(file, datasync), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( int, datasync ) ), TP_fast_assign( struct dentry *dentry = file->f_path.dentry; __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->datasync = datasync; __entry->parent = d_inode(dentry->d_parent)->i_ino; ), TP_printk("dev %d,%d ino %lu parent %lu datasync %d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long) __entry->parent, __entry->datasync) ); TRACE_EVENT(ext4_sync_file_exit, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); TRACE_EVENT(ext4_sync_fs, TP_PROTO(struct super_block *sb, int wait), TP_ARGS(sb, wait), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, wait ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->wait = wait; ), TP_printk("dev %d,%d wait %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->wait) ); TRACE_EVENT(ext4_alloc_da_blocks, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, data_blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->data_blocks = EXT4_I(inode)->i_reserved_data_blocks; ), TP_printk("dev %d,%d ino %lu reserved_data_blocks %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->data_blocks) ); TRACE_EVENT(ext4_mballoc_alloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, goal_logical ) __field( int, goal_start ) __field( __u32, goal_group ) __field( int, goal_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) __field( __u16, found ) __field( __u16, groups ) __field( __u16, buddy ) __field( __u16, flags ) __field( __u16, tail ) __field( __u8, cr ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->goal_logical = ac->ac_g_ex.fe_logical; __entry->goal_start = ac->ac_g_ex.fe_start; __entry->goal_group = ac->ac_g_ex.fe_group; __entry->goal_len = ac->ac_g_ex.fe_len; __entry->result_logical = ac->ac_f_ex.fe_logical; __entry->result_start = ac->ac_f_ex.fe_start; __entry->result_group = ac->ac_f_ex.fe_group; __entry->result_len = ac->ac_f_ex.fe_len; __entry->found = ac->ac_found; __entry->flags = ac->ac_flags; __entry->groups = ac->ac_groups_scanned; __entry->buddy = ac->ac_buddy; __entry->tail = ac->ac_tail; __entry->cr = ac->ac_criteria; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u goal %u/%d/%u@%u " "result %u/%d/%u@%u blks %u grps %u cr %u flags %s " "tail %u broken %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->goal_group, __entry->goal_start, __entry->goal_len, __entry->goal_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical, __entry->found, __entry->groups, __entry->cr, show_mballoc_flags(__entry->flags), __entry->tail, __entry->buddy ? 1 << __entry->buddy : 0) ); TRACE_EVENT(ext4_mballoc_prealloc, TP_PROTO(struct ext4_allocation_context *ac), TP_ARGS(ac), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u32, orig_logical ) __field( int, orig_start ) __field( __u32, orig_group ) __field( int, orig_len ) __field( __u32, result_logical ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = ac->ac_inode->i_sb->s_dev; __entry->ino = ac->ac_inode->i_ino; __entry->orig_logical = ac->ac_o_ex.fe_logical; __entry->orig_start = ac->ac_o_ex.fe_start; __entry->orig_group = ac->ac_o_ex.fe_group; __entry->orig_len = ac->ac_o_ex.fe_len; __entry->result_logical = ac->ac_b_ex.fe_logical; __entry->result_start = ac->ac_b_ex.fe_start; __entry->result_group = ac->ac_b_ex.fe_group; __entry->result_len = ac->ac_b_ex.fe_len; ), TP_printk("dev %d,%d inode %lu orig %u/%d/%u@%u result %u/%d/%u@%u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->orig_group, __entry->orig_start, __entry->orig_len, __entry->orig_logical, __entry->result_group, __entry->result_start, __entry->result_len, __entry->result_logical) ); DECLARE_EVENT_CLASS(ext4__mballoc, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, result_start ) __field( __u32, result_group ) __field( int, result_len ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = inode ? inode->i_ino : 0; __entry->result_start = start; __entry->result_group = group; __entry->result_len = len; ), TP_printk("dev %d,%d inode %lu extent %u/%d/%d ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->result_group, __entry->result_start, __entry->result_len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_discard, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); DEFINE_EVENT(ext4__mballoc, ext4_mballoc_free, TP_PROTO(struct super_block *sb, struct inode *inode, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, inode, group, start, len) ); TRACE_EVENT(ext4_forget, TP_PROTO(struct inode *inode, int is_metadata, __u64 block), TP_ARGS(inode, is_metadata, block), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, block ) __field( int, is_metadata ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->block = block; __entry->is_metadata = is_metadata; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o is_metadata %d block %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->is_metadata, __entry->block) ); TRACE_EVENT(ext4_da_update_reserve_space, TP_PROTO(struct inode *inode, int used_blocks, int quota_claim), TP_ARGS(inode, used_blocks, quota_claim), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, used_blocks ) __field( int, reserved_data_blocks ) __field( int, quota_claim ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->used_blocks = used_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->quota_claim = quota_claim; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu used_blocks %d " "reserved_data_blocks %d quota_claim %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->used_blocks, __entry->reserved_data_blocks, __entry->quota_claim) ); TRACE_EVENT(ext4_da_reserve_space, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->reserved_data_blocks) ); TRACE_EVENT(ext4_da_release_space, TP_PROTO(struct inode *inode, int freed_blocks), TP_ARGS(inode, freed_blocks), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, i_blocks ) __field( int, freed_blocks ) __field( int, reserved_data_blocks ) __field( __u16, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->i_blocks = inode->i_blocks; __entry->freed_blocks = freed_blocks; __entry->reserved_data_blocks = EXT4_I(inode)->i_reserved_data_blocks; __entry->mode = inode->i_mode; ), TP_printk("dev %d,%d ino %lu mode 0%o i_blocks %llu freed_blocks %d " "reserved_data_blocks %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->mode, __entry->i_blocks, __entry->freed_blocks, __entry->reserved_data_blocks) ); DECLARE_EVENT_CLASS(ext4__bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_mb_buddy_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); DEFINE_EVENT(ext4__bitmap_load, ext4_load_inode_bitmap, TP_PROTO(struct super_block *sb, unsigned long group), TP_ARGS(sb, group) ); TRACE_EVENT(ext4_read_block_bitmap_load, TP_PROTO(struct super_block *sb, unsigned long group, bool prefetch), TP_ARGS(sb, group, prefetch), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( bool, prefetch ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->prefetch = prefetch; ), TP_printk("dev %d,%d group %u prefetch %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->prefetch) ); TRACE_EVENT(ext4_direct_IO_enter, TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw), TP_ARGS(inode, offset, len, rw), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned long, len ) __field( int, rw ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->len = len; __entry->rw = rw; ), TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->rw) ); TRACE_EVENT(ext4_direct_IO_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned long len, int rw, int ret), TP_ARGS(inode, offset, len, rw, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned long, len ) __field( int, rw ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->len = len; __entry->rw = rw; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu pos %lld len %lu rw %d ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->len, __entry->rw, __entry->ret) ); DECLARE_EVENT_CLASS(ext4__fallocate_mode, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, offset ) __field( loff_t, len ) __field( int, mode ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; __entry->mode = mode; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld mode %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len, show_falloc_mode(__entry->mode)) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_fallocate_enter, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_punch_hole, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); DEFINE_EVENT(ext4__fallocate_mode, ext4_zero_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int mode), TP_ARGS(inode, offset, len, mode) ); TRACE_EVENT(ext4_fallocate_exit, TP_PROTO(struct inode *inode, loff_t offset, unsigned int max_blocks, int ret), TP_ARGS(inode, offset, max_blocks, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, pos ) __field( unsigned int, blocks ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pos = offset; __entry->blocks = max_blocks; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu pos %lld blocks %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->pos, __entry->blocks, __entry->ret) ); TRACE_EVENT(ext4_unlink_enter, TP_PROTO(struct inode *parent, struct dentry *dentry), TP_ARGS(parent, dentry), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ino_t, parent ) __field( loff_t, size ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->parent = parent->i_ino; __entry->size = d_inode(dentry)->i_size; ), TP_printk("dev %d,%d ino %lu size %lld parent %lu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->size, (unsigned long) __entry->parent) ); TRACE_EVENT(ext4_unlink_exit, TP_PROTO(struct dentry *dentry, int ret), TP_ARGS(dentry, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, ret ) ), TP_fast_assign( __entry->dev = dentry->d_sb->s_dev; __entry->ino = d_inode(dentry)->i_ino; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->ret) ); DECLARE_EVENT_CLASS(ext4__truncate, TP_PROTO(struct inode *inode), TP_ARGS(inode), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( __u64, blocks ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->blocks = inode->i_blocks; ), TP_printk("dev %d,%d ino %lu blocks %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->blocks) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_enter, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); DEFINE_EVENT(ext4__truncate, ext4_truncate_exit, TP_PROTO(struct inode *inode), TP_ARGS(inode) ); /* 'ux' is the unwritten extent. */ TRACE_EVENT(ext4_ext_convert_to_initialized_enter, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux), TP_ARGS(inode, map, ux), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u u_lblk %u u_len %u " "u_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk) ); /* * 'ux' is the unwritten extent. * 'ix' is the initialized extent to which blocks are transferred. */ TRACE_EVENT(ext4_ext_convert_to_initialized_fastpath, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, struct ext4_extent *ux, struct ext4_extent *ix), TP_ARGS(inode, map, ux, ix), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, m_lblk ) __field( unsigned, m_len ) __field( ext4_lblk_t, u_lblk ) __field( unsigned, u_len ) __field( ext4_fsblk_t, u_pblk ) __field( ext4_lblk_t, i_lblk ) __field( unsigned, i_len ) __field( ext4_fsblk_t, i_pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_len = map->m_len; __entry->u_lblk = le32_to_cpu(ux->ee_block); __entry->u_len = ext4_ext_get_actual_len(ux); __entry->u_pblk = ext4_ext_pblock(ux); __entry->i_lblk = le32_to_cpu(ix->ee_block); __entry->i_len = ext4_ext_get_actual_len(ix); __entry->i_pblk = ext4_ext_pblock(ix); ), TP_printk("dev %d,%d ino %lu m_lblk %u m_len %u " "u_lblk %u u_len %u u_pblk %llu " "i_lblk %u i_len %u i_pblk %llu ", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->m_lblk, __entry->m_len, __entry->u_lblk, __entry->u_len, __entry->u_pblk, __entry->i_lblk, __entry->i_len, __entry->i_pblk) ); DECLARE_EVENT_CLASS(ext4__map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, unsigned int flags), TP_ARGS(inode, lblk, len, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, flags ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->flags = flags; ), TP_printk("dev %d,%d ino %lu lblk %u len %u flags %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, show_map_flags(__entry->flags)) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ext_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DEFINE_EVENT(ext4__map_blocks_enter, ext4_ind_map_blocks_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned len, unsigned flags), TP_ARGS(inode, lblk, len, flags) ); DECLARE_EVENT_CLASS(ext4__map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( unsigned int, flags ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( unsigned int, mflags ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->pblk = map->m_pblk; __entry->lblk = map->m_lblk; __entry->len = map->m_len; __entry->mflags = map->m_flags; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu flags %s lblk %u pblk %llu len %u " "mflags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, show_map_flags(__entry->flags), __entry->lblk, __entry->pblk, __entry->len, show_mflags(__entry->mflags), __entry->ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ext_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); DEFINE_EVENT(ext4__map_blocks_exit, ext4_ind_map_blocks_exit, TP_PROTO(struct inode *inode, unsigned flags, struct ext4_map_blocks *map, int ret), TP_ARGS(inode, flags, map, ret) ); TRACE_EVENT(ext4_ext_load_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk), TP_ARGS(inode, lblk, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->pblk) ); TRACE_EVENT(ext4_load_inode, TP_PROTO(struct super_block *sb, unsigned long ino), TP_ARGS(sb, ino), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ino = ino; ), TP_printk("dev %d,%d ino %ld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino) ); TRACE_EVENT(ext4_journal_start, TP_PROTO(struct super_block *sb, int blocks, int rsv_blocks, int revoke_creds, unsigned long IP), TP_ARGS(sb, blocks, rsv_blocks, revoke_creds, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, ip ) __field( int, blocks ) __field( int, rsv_blocks ) __field( int, revoke_creds ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; __entry->rsv_blocks = rsv_blocks; __entry->revoke_creds = revoke_creds; ), TP_printk("dev %d,%d blocks %d, rsv_blocks %d, revoke_creds %d, " "caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, __entry->rsv_blocks, __entry->revoke_creds, (void *)__entry->ip) ); TRACE_EVENT(ext4_journal_start_reserved, TP_PROTO(struct super_block *sb, int blocks, unsigned long IP), TP_ARGS(sb, blocks, IP), TP_STRUCT__entry( __field( dev_t, dev ) __field(unsigned long, ip ) __field( int, blocks ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->ip = IP; __entry->blocks = blocks; ), TP_printk("dev %d,%d blocks, %d caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->blocks, (void *)__entry->ip) ); DECLARE_EVENT_CLASS(ext4__trim, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len), TP_STRUCT__entry( __field( int, dev_major ) __field( int, dev_minor ) __field( __u32, group ) __field( int, start ) __field( int, len ) ), TP_fast_assign( __entry->dev_major = MAJOR(sb->s_dev); __entry->dev_minor = MINOR(sb->s_dev); __entry->group = group; __entry->start = start; __entry->len = len; ), TP_printk("dev %d,%d group %u, start %d, len %d", __entry->dev_major, __entry->dev_minor, __entry->group, __entry->start, __entry->len) ); DEFINE_EVENT(ext4__trim, ext4_trim_extent, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); DEFINE_EVENT(ext4__trim, ext4_trim_all_free, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_grpblk_t start, ext4_grpblk_t len), TP_ARGS(sb, group, start, len) ); TRACE_EVENT(ext4_ext_handle_unwritten_extents, TP_PROTO(struct inode *inode, struct ext4_map_blocks *map, int flags, unsigned int allocated, ext4_fsblk_t newblock), TP_ARGS(inode, map, flags, allocated, newblock), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( unsigned int, allocated ) __field( ext4_fsblk_t, newblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->flags = flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->allocated = allocated; __entry->newblk = newblock; ), TP_printk("dev %d,%d ino %lu m_lblk %u m_pblk %llu m_len %u flags %s " "allocated %d newblock %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_map_flags(__entry->flags), (unsigned int) __entry->allocated, (unsigned long long) __entry->newblk) ); TRACE_EVENT(ext4_get_implied_cluster_alloc_exit, TP_PROTO(struct super_block *sb, struct ext4_map_blocks *map, int ret), TP_ARGS(sb, map, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned int, flags ) __field( ext4_lblk_t, lblk ) __field( ext4_fsblk_t, pblk ) __field( unsigned int, len ) __field( int, ret ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = map->m_flags; __entry->lblk = map->m_lblk; __entry->pblk = map->m_pblk; __entry->len = map->m_len; __entry->ret = ret; ), TP_printk("dev %d,%d m_lblk %u m_pblk %llu m_len %u m_flags %s ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->lblk, (unsigned long long) __entry->pblk, __entry->len, show_mflags(__entry->flags), __entry->ret) ); TRACE_EVENT(ext4_ext_put_in_cache, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len, ext4_fsblk_t start), TP_ARGS(inode, lblk, len, start), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) __field( ext4_fsblk_t, start ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; __entry->start = start; ), TP_printk("dev %d,%d ino %lu lblk %u len %u start %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, __entry->len, (unsigned long long) __entry->start) ); TRACE_EVENT(ext4_ext_in_cache, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, int ret), TP_ARGS(inode, lblk, ret), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( int, ret ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->ret = ret; ), TP_printk("dev %d,%d ino %lu lblk %u ret %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, __entry->ret) ); TRACE_EVENT(ext4_find_delalloc_range, TP_PROTO(struct inode *inode, ext4_lblk_t from, ext4_lblk_t to, int reverse, int found, ext4_lblk_t found_blk), TP_ARGS(inode, from, to, reverse, found, found_blk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( int, reverse ) __field( int, found ) __field( ext4_lblk_t, found_blk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->reverse = reverse; __entry->found = found; __entry->found_blk = found_blk; ), TP_printk("dev %d,%d ino %lu from %u to %u reverse %d found %d " "(blk = %u)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->from, (unsigned) __entry->to, __entry->reverse, __entry->found, (unsigned) __entry->found_blk) ); TRACE_EVENT(ext4_get_reserved_cluster_alloc, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, unsigned int len), TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( unsigned int, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu lblk %u len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, __entry->len) ); TRACE_EVENT(ext4_ext_show_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, unsigned short len), TP_ARGS(inode, lblk, pblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) __field( ext4_lblk_t, lblk ) __field( unsigned short, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu lblk %u pblk %llu len %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->lblk, (unsigned long long) __entry->pblk, (unsigned short) __entry->len) ); TRACE_EVENT(ext4_remove_blocks, TP_PROTO(struct inode *inode, struct ext4_extent *ex, ext4_lblk_t from, ext4_fsblk_t to, struct partial_cluster *pc), TP_ARGS(inode, ex, from, to, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, from ) __field( ext4_lblk_t, to ) __field( ext4_fsblk_t, ee_pblk ) __field( ext4_lblk_t, ee_lblk ) __field( unsigned short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->from = from; __entry->to = to; __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu extent [%u(%llu), %u]" "from %u to %u partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (unsigned) __entry->from, (unsigned) __entry->to, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_leaf, TP_PROTO(struct inode *inode, ext4_lblk_t start, struct ext4_extent *ex, struct partial_cluster *pc), TP_ARGS(inode, start, ex, pc), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, ee_lblk ) __field( ext4_fsblk_t, ee_pblk ) __field( short, ee_len ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->ee_lblk = le32_to_cpu(ex->ee_block); __entry->ee_pblk = ext4_ext_pblock(ex); __entry->ee_len = ext4_ext_get_actual_len(ex); __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; ), TP_printk("dev %d,%d ino %lu start_lblk %u last_extent [%u(%llu), %u]" "partial [pclu %lld lblk %u state %d]", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->ee_lblk, (unsigned long long) __entry->ee_pblk, (unsigned short) __entry->ee_len, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state) ); TRACE_EVENT(ext4_ext_rm_idx, TP_PROTO(struct inode *inode, ext4_fsblk_t pblk), TP_ARGS(inode, pblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_fsblk_t, pblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->pblk = pblk; ), TP_printk("dev %d,%d ino %lu index_pblk %llu", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned long long) __entry->pblk) ); TRACE_EVENT(ext4_ext_remove_space, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth), TP_ARGS(inode, start, end, depth), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth) ); TRACE_EVENT(ext4_ext_remove_space_done, TP_PROTO(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end, int depth, struct partial_cluster *pc, __le16 eh_entries), TP_ARGS(inode, start, end, depth, pc, eh_entries), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, start ) __field( ext4_lblk_t, end ) __field( int, depth ) __field( ext4_fsblk_t, pc_pclu ) __field( ext4_lblk_t, pc_lblk ) __field( int, pc_state ) __field( unsigned short, eh_entries ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->depth = depth; __entry->pc_pclu = pc->pclu; __entry->pc_lblk = pc->lblk; __entry->pc_state = pc->state; __entry->eh_entries = le16_to_cpu(eh_entries); ), TP_printk("dev %d,%d ino %lu since %u end %u depth %d " "partial [pclu %lld lblk %u state %d] " "remaining_entries %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, (unsigned) __entry->start, (unsigned) __entry->end, __entry->depth, (long long) __entry->pc_pclu, (unsigned int) __entry->pc_lblk, (int) __entry->pc_state, (unsigned short) __entry->eh_entries) ); DECLARE_EVENT_CLASS(ext4__es_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); DEFINE_EVENT(ext4__es_extent, ext4_es_insert_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); DEFINE_EVENT(ext4__es_extent, ext4_es_cache_extent, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es) ); TRACE_EVENT(ext4_es_remove_extent, TP_PROTO(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len), TP_ARGS(inode, lblk, len), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( loff_t, lblk ) __field( loff_t, len ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; __entry->len = len; ), TP_printk("dev %d,%d ino %lu es [%lld/%lld)", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len) ); TRACE_EVENT(ext4_es_find_extent_range_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_find_extent_range_exit, TP_PROTO(struct inode *inode, struct extent_status *es), TP_ARGS(inode, es), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status)) ); TRACE_EVENT(ext4_es_lookup_extent_enter, TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_ARGS(inode, lblk), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = lblk; ), TP_printk("dev %d,%d ino %lu lblk %u", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk) ); TRACE_EVENT(ext4_es_lookup_extent_exit, TP_PROTO(struct inode *inode, struct extent_status *es, int found), TP_ARGS(inode, es, found), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( int, found ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->found = found; ), TP_printk("dev %d,%d ino %lu found %d [%u/%u) %llu %s", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->found, __entry->lblk, __entry->len, __entry->found ? __entry->pblk : 0, show_extent_status(__entry->found ? __entry->status : 0)) ); DECLARE_EVENT_CLASS(ext4__es_shrink_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_to_scan ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_to_scan = nr_to_scan; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_to_scan %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_to_scan, __entry->cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_count, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); DEFINE_EVENT(ext4__es_shrink_enter, ext4_es_shrink_scan_enter, TP_PROTO(struct super_block *sb, int nr_to_scan, int cache_cnt), TP_ARGS(sb, nr_to_scan, cache_cnt) ); TRACE_EVENT(ext4_es_shrink_scan_exit, TP_PROTO(struct super_block *sb, int nr_shrunk, int cache_cnt), TP_ARGS(sb, nr_shrunk, cache_cnt), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( int, cache_cnt ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->cache_cnt = cache_cnt; ), TP_printk("dev %d,%d nr_shrunk %d cache_cnt %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->cache_cnt) ); TRACE_EVENT(ext4_collapse_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_insert_range, TP_PROTO(struct inode *inode, loff_t offset, loff_t len), TP_ARGS(inode, offset, len), TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) __field(loff_t, offset) __field(loff_t, len) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->offset = offset; __entry->len = len; ), TP_printk("dev %d,%d ino %lu offset %lld len %lld", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->offset, __entry->len) ); TRACE_EVENT(ext4_es_shrink, TP_PROTO(struct super_block *sb, int nr_shrunk, u64 scan_time, int nr_skipped, int retried), TP_ARGS(sb, nr_shrunk, scan_time, nr_skipped, retried), TP_STRUCT__entry( __field( dev_t, dev ) __field( int, nr_shrunk ) __field( unsigned long long, scan_time ) __field( int, nr_skipped ) __field( int, retried ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nr_shrunk = nr_shrunk; __entry->scan_time = div_u64(scan_time, 1000); __entry->nr_skipped = nr_skipped; __entry->retried = retried; ), TP_printk("dev %d,%d nr_shrunk %d, scan_time %llu " "nr_skipped %d retried %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nr_shrunk, __entry->scan_time, __entry->nr_skipped, __entry->retried) ); TRACE_EVENT(ext4_es_insert_delayed_block, TP_PROTO(struct inode *inode, struct extent_status *es, bool allocated), TP_ARGS(inode, es, allocated), TP_STRUCT__entry( __field( dev_t, dev ) __field( ino_t, ino ) __field( ext4_lblk_t, lblk ) __field( ext4_lblk_t, len ) __field( ext4_fsblk_t, pblk ) __field( char, status ) __field( bool, allocated ) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->lblk = es->es_lblk; __entry->len = es->es_len; __entry->pblk = ext4_es_show_pblock(es); __entry->status = ext4_es_status(es); __entry->allocated = allocated; ), TP_printk("dev %d,%d ino %lu es [%u/%u) mapped %llu status %s " "allocated %d", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino, __entry->lblk, __entry->len, __entry->pblk, show_extent_status(__entry->status), __entry->allocated) ); /* fsmap traces */ DECLARE_EVENT_CLASS(ext4_fsmap_class, TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, u64 owner), TP_ARGS(sb, keydev, agno, bno, len, owner), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u32, agno) __field(u64, bno) __field(u64, len) __field(u64, owner) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(keydev); __entry->agno = agno; __entry->bno = bno; __entry->len = len; __entry->owner = owner; ), TP_printk("dev %d:%d keydev %d:%d agno %u bno %llu len %llu owner %lld\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->agno, __entry->bno, __entry->len, __entry->owner) ) #define DEFINE_FSMAP_EVENT(name) \ DEFINE_EVENT(ext4_fsmap_class, name, \ TP_PROTO(struct super_block *sb, u32 keydev, u32 agno, u64 bno, u64 len, \ u64 owner), \ TP_ARGS(sb, keydev, agno, bno, len, owner)) DEFINE_FSMAP_EVENT(ext4_fsmap_low_key); DEFINE_FSMAP_EVENT(ext4_fsmap_high_key); DEFINE_FSMAP_EVENT(ext4_fsmap_mapping); DECLARE_EVENT_CLASS(ext4_getfsmap_class, TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), TP_ARGS(sb, fsmap), TP_STRUCT__entry( __field(dev_t, dev) __field(dev_t, keydev) __field(u64, block) __field(u64, len) __field(u64, owner) __field(u64, flags) ), TP_fast_assign( __entry->dev = sb->s_bdev->bd_dev; __entry->keydev = new_decode_dev(fsmap->fmr_device); __entry->block = fsmap->fmr_physical; __entry->len = fsmap->fmr_length; __entry->owner = fsmap->fmr_owner; __entry->flags = fsmap->fmr_flags; ), TP_printk("dev %d:%d keydev %d:%d block %llu len %llu owner %lld flags 0x%llx\n", MAJOR(__entry->dev), MINOR(__entry->dev), MAJOR(__entry->keydev), MINOR(__entry->keydev), __entry->block, __entry->len, __entry->owner, __entry->flags) ) #define DEFINE_GETFSMAP_EVENT(name) \ DEFINE_EVENT(ext4_getfsmap_class, name, \ TP_PROTO(struct super_block *sb, struct ext4_fsmap *fsmap), \ TP_ARGS(sb, fsmap)) DEFINE_GETFSMAP_EVENT(ext4_getfsmap_low_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_high_key); DEFINE_GETFSMAP_EVENT(ext4_getfsmap_mapping); TRACE_EVENT(ext4_shutdown, TP_PROTO(struct super_block *sb, unsigned long flags), TP_ARGS(sb, flags), TP_STRUCT__entry( __field( dev_t, dev ) __field( unsigned, flags ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->flags = flags; ), TP_printk("dev %d,%d flags %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->flags) ); TRACE_EVENT(ext4_error, TP_PROTO(struct super_block *sb, const char *function, unsigned int line), TP_ARGS(sb, function, line), TP_STRUCT__entry( __field( dev_t, dev ) __field( const char *, function ) __field( unsigned, line ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->function = function; __entry->line = line; ), TP_printk("dev %d,%d function %s line %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->function, __entry->line) ); TRACE_EVENT(ext4_prefetch_bitmaps, TP_PROTO(struct super_block *sb, ext4_group_t group, ext4_group_t next, unsigned int prefetch_ios), TP_ARGS(sb, group, next, prefetch_ios), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) __field( __u32, next ) __field( __u32, ios ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; __entry->next = next; __entry->ios = prefetch_ios; ), TP_printk("dev %d,%d group %u next %u ios %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group, __entry->next, __entry->ios) ); TRACE_EVENT(ext4_lazy_itable_init, TP_PROTO(struct super_block *sb, ext4_group_t group), TP_ARGS(sb, group), TP_STRUCT__entry( __field( dev_t, dev ) __field( __u32, group ) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->group = group; ), TP_printk("dev %d,%d group %u", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->group) ); TRACE_EVENT(ext4_fc_replay_scan, TP_PROTO(struct super_block *sb, int error, int off), TP_ARGS(sb, error, off), TP_STRUCT__entry( __field(dev_t, dev) __field(int, error) __field(int, off) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->error = error; __entry->off = off; ), TP_printk("FC scan pass on dev %d,%d: error %d, off %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->error, __entry->off) ); TRACE_EVENT(ext4_fc_replay, TP_PROTO(struct super_block *sb, int tag, int ino, int priv1, int priv2), TP_ARGS(sb, tag, ino, priv1, priv2), TP_STRUCT__entry( __field(dev_t, dev) __field(int, tag) __field(int, ino) __field(int, priv1) __field(int, priv2) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->tag = tag; __entry->ino = ino; __entry->priv1 = priv1; __entry->priv2 = priv2; ), TP_printk("FC Replay %d,%d: tag %d, ino %d, data1 %d, data2 %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->tag, __entry->ino, __entry->priv1, __entry->priv2) ); TRACE_EVENT(ext4_fc_commit_start, TP_PROTO(struct super_block *sb), TP_ARGS(sb), TP_STRUCT__entry( __field(dev_t, dev) ), TP_fast_assign( __entry->dev = sb->s_dev; ), TP_printk("fast_commit started on dev %d,%d", MAJOR(__entry->dev), MINOR(__entry->dev)) ); TRACE_EVENT(ext4_fc_commit_stop, TP_PROTO(struct super_block *sb, int nblks, int reason), TP_ARGS(sb, nblks, reason), TP_STRUCT__entry( __field(dev_t, dev) __field(int, nblks) __field(int, reason) __field(int, num_fc) __field(int, num_fc_ineligible) __field(int, nblks_agg) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->nblks = nblks; __entry->reason = reason; __entry->num_fc = EXT4_SB(sb)->s_fc_stats.fc_num_commits; __entry->num_fc_ineligible = EXT4_SB(sb)->s_fc_stats.fc_ineligible_commits; __entry->nblks_agg = EXT4_SB(sb)->s_fc_stats.fc_numblks; ), TP_printk("fc on [%d,%d] nblks %d, reason %d, fc = %d, ineligible = %d, agg_nblks %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->nblks, __entry->reason, __entry->num_fc, __entry->num_fc_ineligible, __entry->nblks_agg) ); #define FC_REASON_NAME_STAT(reason) \ show_fc_reason(reason), \ __entry->sbi->s_fc_stats.fc_ineligible_reason_count[reason] TRACE_EVENT(ext4_fc_stats, TP_PROTO(struct super_block *sb), TP_ARGS(sb), TP_STRUCT__entry( __field(dev_t, dev) __field(struct ext4_sb_info *, sbi) __field(int, count) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->sbi = EXT4_SB(sb); ), TP_printk("dev %d:%d fc ineligible reasons:\n" "%s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d, %s:%d; " "num_commits:%ld, ineligible: %ld, numblks: %ld", MAJOR(__entry->dev), MINOR(__entry->dev), FC_REASON_NAME_STAT(EXT4_FC_REASON_XATTR), FC_REASON_NAME_STAT(EXT4_FC_REASON_CROSS_RENAME), FC_REASON_NAME_STAT(EXT4_FC_REASON_JOURNAL_FLAG_CHANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_NOMEM), FC_REASON_NAME_STAT(EXT4_FC_REASON_SWAP_BOOT), FC_REASON_NAME_STAT(EXT4_FC_REASON_RESIZE), FC_REASON_NAME_STAT(EXT4_FC_REASON_RENAME_DIR), FC_REASON_NAME_STAT(EXT4_FC_REASON_FALLOC_RANGE), FC_REASON_NAME_STAT(EXT4_FC_REASON_INODE_JOURNAL_DATA), __entry->sbi->s_fc_stats.fc_num_commits, __entry->sbi->s_fc_stats.fc_ineligible_commits, __entry->sbi->s_fc_stats.fc_numblks) ); #define DEFINE_TRACE_DENTRY_EVENT(__type) \ TRACE_EVENT(ext4_fc_track_##__type, \ TP_PROTO(struct inode *inode, struct dentry *dentry, int ret), \ \ TP_ARGS(inode, dentry, ret), \ \ TP_STRUCT__entry( \ __field(dev_t, dev) \ __field(int, ino) \ __field(int, error) \ ), \ \ TP_fast_assign( \ __entry->dev = inode->i_sb->s_dev; \ __entry->ino = inode->i_ino; \ __entry->error = ret; \ ), \ \ TP_printk("dev %d:%d, inode %d, error %d, fc_%s", \ MAJOR(__entry->dev), MINOR(__entry->dev), \ __entry->ino, __entry->error, \ #__type) \ ) DEFINE_TRACE_DENTRY_EVENT(create); DEFINE_TRACE_DENTRY_EVENT(link); DEFINE_TRACE_DENTRY_EVENT(unlink); TRACE_EVENT(ext4_fc_track_inode, TP_PROTO(struct inode *inode, int ret), TP_ARGS(inode, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(int, ino) __field(int, error) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->error = ret; ), TP_printk("dev %d:%d, inode %d, error %d", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->error) ); TRACE_EVENT(ext4_fc_track_range, TP_PROTO(struct inode *inode, long start, long end, int ret), TP_ARGS(inode, start, end, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(int, ino) __field(long, start) __field(long, end) __field(int, error) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; __entry->start = start; __entry->end = end; __entry->error = ret; ), TP_printk("dev %d:%d, inode %d, error %d, start %ld, end %ld", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->error, __entry->start, __entry->end) ); #endif /* _TRACE_EXT4_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM scsi #if !defined(_TRACE_SCSI_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCSI_H #include <scsi/scsi_cmnd.h> #include <scsi/scsi_host.h> #include <linux/tracepoint.h> #include <linux/trace_seq.h> #define scsi_opcode_name(opcode) { opcode, #opcode } #define show_opcode_name(val) \ __print_symbolic(val, \ scsi_opcode_name(TEST_UNIT_READY), \ scsi_opcode_name(REZERO_UNIT), \ scsi_opcode_name(REQUEST_SENSE), \ scsi_opcode_name(FORMAT_UNIT), \ scsi_opcode_name(READ_BLOCK_LIMITS), \ scsi_opcode_name(REASSIGN_BLOCKS), \ scsi_opcode_name(INITIALIZE_ELEMENT_STATUS), \ scsi_opcode_name(READ_6), \ scsi_opcode_name(WRITE_6), \ scsi_opcode_name(SEEK_6), \ scsi_opcode_name(READ_REVERSE), \ scsi_opcode_name(WRITE_FILEMARKS), \ scsi_opcode_name(SPACE), \ scsi_opcode_name(INQUIRY), \ scsi_opcode_name(RECOVER_BUFFERED_DATA), \ scsi_opcode_name(MODE_SELECT), \ scsi_opcode_name(RESERVE), \ scsi_opcode_name(RELEASE), \ scsi_opcode_name(COPY), \ scsi_opcode_name(ERASE), \ scsi_opcode_name(MODE_SENSE), \ scsi_opcode_name(START_STOP), \ scsi_opcode_name(RECEIVE_DIAGNOSTIC), \ scsi_opcode_name(SEND_DIAGNOSTIC), \ scsi_opcode_name(ALLOW_MEDIUM_REMOVAL), \ scsi_opcode_name(SET_WINDOW), \ scsi_opcode_name(READ_CAPACITY), \ scsi_opcode_name(READ_10), \ scsi_opcode_name(WRITE_10), \ scsi_opcode_name(SEEK_10), \ scsi_opcode_name(POSITION_TO_ELEMENT), \ scsi_opcode_name(WRITE_VERIFY), \ scsi_opcode_name(VERIFY), \ scsi_opcode_name(SEARCH_HIGH), \ scsi_opcode_name(SEARCH_EQUAL), \ scsi_opcode_name(SEARCH_LOW), \ scsi_opcode_name(SET_LIMITS), \ scsi_opcode_name(PRE_FETCH), \ scsi_opcode_name(READ_POSITION), \ scsi_opcode_name(SYNCHRONIZE_CACHE), \ scsi_opcode_name(LOCK_UNLOCK_CACHE), \ scsi_opcode_name(READ_DEFECT_DATA), \ scsi_opcode_name(MEDIUM_SCAN), \ scsi_opcode_name(COMPARE), \ scsi_opcode_name(COPY_VERIFY), \ scsi_opcode_name(WRITE_BUFFER), \ scsi_opcode_name(READ_BUFFER), \ scsi_opcode_name(UPDATE_BLOCK), \ scsi_opcode_name(READ_LONG), \ scsi_opcode_name(WRITE_LONG), \ scsi_opcode_name(CHANGE_DEFINITION), \ scsi_opcode_name(WRITE_SAME), \ scsi_opcode_name(UNMAP), \ scsi_opcode_name(READ_TOC), \ scsi_opcode_name(LOG_SELECT), \ scsi_opcode_name(LOG_SENSE), \ scsi_opcode_name(XDWRITEREAD_10), \ scsi_opcode_name(MODE_SELECT_10), \ scsi_opcode_name(RESERVE_10), \ scsi_opcode_name(RELEASE_10), \ scsi_opcode_name(MODE_SENSE_10), \ scsi_opcode_name(PERSISTENT_RESERVE_IN), \ scsi_opcode_name(PERSISTENT_RESERVE_OUT), \ scsi_opcode_name(VARIABLE_LENGTH_CMD), \ scsi_opcode_name(REPORT_LUNS), \ scsi_opcode_name(MAINTENANCE_IN), \ scsi_opcode_name(MAINTENANCE_OUT), \ scsi_opcode_name(MOVE_MEDIUM), \ scsi_opcode_name(EXCHANGE_MEDIUM), \ scsi_opcode_name(READ_12), \ scsi_opcode_name(WRITE_12), \ scsi_opcode_name(WRITE_VERIFY_12), \ scsi_opcode_name(SEARCH_HIGH_12), \ scsi_opcode_name(SEARCH_EQUAL_12), \ scsi_opcode_name(SEARCH_LOW_12), \ scsi_opcode_name(READ_ELEMENT_STATUS), \ scsi_opcode_name(SEND_VOLUME_TAG), \ scsi_opcode_name(WRITE_LONG_2), \ scsi_opcode_name(READ_16), \ scsi_opcode_name(WRITE_16), \ scsi_opcode_name(VERIFY_16), \ scsi_opcode_name(WRITE_SAME_16), \ scsi_opcode_name(ZBC_OUT), \ scsi_opcode_name(ZBC_IN), \ scsi_opcode_name(SERVICE_ACTION_IN_16), \ scsi_opcode_name(READ_32), \ scsi_opcode_name(WRITE_32), \ scsi_opcode_name(WRITE_SAME_32), \ scsi_opcode_name(ATA_16), \ scsi_opcode_name(ATA_12)) #define scsi_hostbyte_name(result) { result, #result } #define show_hostbyte_name(val) \ __print_symbolic(val, \ scsi_hostbyte_name(DID_OK), \ scsi_hostbyte_name(DID_NO_CONNECT), \ scsi_hostbyte_name(DID_BUS_BUSY), \ scsi_hostbyte_name(DID_TIME_OUT), \ scsi_hostbyte_name(DID_BAD_TARGET), \ scsi_hostbyte_name(DID_ABORT), \ scsi_hostbyte_name(DID_PARITY), \ scsi_hostbyte_name(DID_ERROR), \ scsi_hostbyte_name(DID_RESET), \ scsi_hostbyte_name(DID_BAD_INTR), \ scsi_hostbyte_name(DID_PASSTHROUGH), \ scsi_hostbyte_name(DID_SOFT_ERROR), \ scsi_hostbyte_name(DID_IMM_RETRY), \ scsi_hostbyte_name(DID_REQUEUE), \ scsi_hostbyte_name(DID_TRANSPORT_DISRUPTED), \ scsi_hostbyte_name(DID_TRANSPORT_FAILFAST)) #define scsi_driverbyte_name(result) { result, #result } #define show_driverbyte_name(val) \ __print_symbolic(val, \ scsi_driverbyte_name(DRIVER_OK), \ scsi_driverbyte_name(DRIVER_BUSY), \ scsi_driverbyte_name(DRIVER_SOFT), \ scsi_driverbyte_name(DRIVER_MEDIA), \ scsi_driverbyte_name(DRIVER_ERROR), \ scsi_driverbyte_name(DRIVER_INVALID), \ scsi_driverbyte_name(DRIVER_TIMEOUT), \ scsi_driverbyte_name(DRIVER_HARD), \ scsi_driverbyte_name(DRIVER_SENSE)) #define scsi_msgbyte_name(result) { result, #result } #define show_msgbyte_name(val) \ __print_symbolic(val, \ scsi_msgbyte_name(COMMAND_COMPLETE), \ scsi_msgbyte_name(EXTENDED_MESSAGE), \ scsi_msgbyte_name(SAVE_POINTERS), \ scsi_msgbyte_name(RESTORE_POINTERS), \ scsi_msgbyte_name(DISCONNECT), \ scsi_msgbyte_name(INITIATOR_ERROR), \ scsi_msgbyte_name(ABORT_TASK_SET), \ scsi_msgbyte_name(MESSAGE_REJECT), \ scsi_msgbyte_name(NOP), \ scsi_msgbyte_name(MSG_PARITY_ERROR), \ scsi_msgbyte_name(LINKED_CMD_COMPLETE), \ scsi_msgbyte_name(LINKED_FLG_CMD_COMPLETE), \ scsi_msgbyte_name(TARGET_RESET), \ scsi_msgbyte_name(ABORT_TASK), \ scsi_msgbyte_name(CLEAR_TASK_SET), \ scsi_msgbyte_name(INITIATE_RECOVERY), \ scsi_msgbyte_name(RELEASE_RECOVERY), \ scsi_msgbyte_name(CLEAR_ACA), \ scsi_msgbyte_name(LOGICAL_UNIT_RESET), \ scsi_msgbyte_name(SIMPLE_QUEUE_TAG), \ scsi_msgbyte_name(HEAD_OF_QUEUE_TAG), \ scsi_msgbyte_name(ORDERED_QUEUE_TAG), \ scsi_msgbyte_name(IGNORE_WIDE_RESIDUE), \ scsi_msgbyte_name(ACA), \ scsi_msgbyte_name(QAS_REQUEST), \ scsi_msgbyte_name(BUS_DEVICE_RESET), \ scsi_msgbyte_name(ABORT)) #define scsi_statusbyte_name(result) { result, #result } #define show_statusbyte_name(val) \ __print_symbolic(val, \ scsi_statusbyte_name(SAM_STAT_GOOD), \ scsi_statusbyte_name(SAM_STAT_CHECK_CONDITION), \ scsi_statusbyte_name(SAM_STAT_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_BUSY), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE), \ scsi_statusbyte_name(SAM_STAT_INTERMEDIATE_CONDITION_MET), \ scsi_statusbyte_name(SAM_STAT_RESERVATION_CONFLICT), \ scsi_statusbyte_name(SAM_STAT_COMMAND_TERMINATED), \ scsi_statusbyte_name(SAM_STAT_TASK_SET_FULL), \ scsi_statusbyte_name(SAM_STAT_ACA_ACTIVE), \ scsi_statusbyte_name(SAM_STAT_TASK_ABORTED)) #define scsi_prot_op_name(result) { result, #result } #define show_prot_op_name(val) \ __print_symbolic(val, \ scsi_prot_op_name(SCSI_PROT_NORMAL), \ scsi_prot_op_name(SCSI_PROT_READ_INSERT), \ scsi_prot_op_name(SCSI_PROT_WRITE_STRIP), \ scsi_prot_op_name(SCSI_PROT_READ_STRIP), \ scsi_prot_op_name(SCSI_PROT_WRITE_INSERT), \ scsi_prot_op_name(SCSI_PROT_READ_PASS), \ scsi_prot_op_name(SCSI_PROT_WRITE_PASS)) const char *scsi_trace_parse_cdb(struct trace_seq*, unsigned char*, int); #define __parse_cdb(cdb, len) scsi_trace_parse_cdb(p, cdb, len) TRACE_EVENT(scsi_dispatch_cmd_start, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s cmnd=(%s %s raw=%s)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len)) ); TRACE_EVENT(scsi_dispatch_cmd_error, TP_PROTO(struct scsi_cmnd *cmd, int rtn), TP_ARGS(cmd, rtn), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, rtn ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->rtn = rtn; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u prot_sgl=%u" \ " prot_op=%s cmnd=(%s %s raw=%s) rtn=%d", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), __entry->rtn) ); DECLARE_EVENT_CLASS(scsi_cmd_done_timeout_template, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd), TP_STRUCT__entry( __field( unsigned int, host_no ) __field( unsigned int, channel ) __field( unsigned int, id ) __field( unsigned int, lun ) __field( int, result ) __field( unsigned int, opcode ) __field( unsigned int, cmd_len ) __field( unsigned int, data_sglen ) __field( unsigned int, prot_sglen ) __field( unsigned char, prot_op ) __dynamic_array(unsigned char, cmnd, cmd->cmd_len) ), TP_fast_assign( __entry->host_no = cmd->device->host->host_no; __entry->channel = cmd->device->channel; __entry->id = cmd->device->id; __entry->lun = cmd->device->lun; __entry->result = cmd->result; __entry->opcode = cmd->cmnd[0]; __entry->cmd_len = cmd->cmd_len; __entry->data_sglen = scsi_sg_count(cmd); __entry->prot_sglen = scsi_prot_sg_count(cmd); __entry->prot_op = scsi_get_prot_op(cmd); memcpy(__get_dynamic_array(cmnd), cmd->cmnd, cmd->cmd_len); ), TP_printk("host_no=%u channel=%u id=%u lun=%u data_sgl=%u " \ "prot_sgl=%u prot_op=%s cmnd=(%s %s raw=%s) result=(driver=" \ "%s host=%s message=%s status=%s)", __entry->host_no, __entry->channel, __entry->id, __entry->lun, __entry->data_sglen, __entry->prot_sglen, show_prot_op_name(__entry->prot_op), show_opcode_name(__entry->opcode), __parse_cdb(__get_dynamic_array(cmnd), __entry->cmd_len), __print_hex(__get_dynamic_array(cmnd), __entry->cmd_len), show_driverbyte_name(((__entry->result) >> 24) & 0xff), show_hostbyte_name(((__entry->result) >> 16) & 0xff), show_msgbyte_name(((__entry->result) >> 8) & 0xff), show_statusbyte_name(__entry->result & 0xff)) ); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_done, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); DEFINE_EVENT(scsi_cmd_done_timeout_template, scsi_dispatch_cmd_timeout, TP_PROTO(struct scsi_cmnd *cmd), TP_ARGS(cmd)); TRACE_EVENT(scsi_eh_wakeup, TP_PROTO(struct Scsi_Host *shost), TP_ARGS(shost), TP_STRUCT__entry( __field( unsigned int, host_no ) ), TP_fast_assign( __entry->host_no = shost->host_no; ), TP_printk("host_no=%u", __entry->host_no) ); #endif /* _TRACE_SCSI_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 /* SPDX-License-Identifier: GPL-2.0-only */ #ifndef _NET_ETHTOOL_NETLINK_H #define _NET_ETHTOOL_NETLINK_H #include <linux/ethtool_netlink.h> #include <linux/netdevice.h> #include <net/genetlink.h> #include <net/sock.h> struct ethnl_req_info; int ethnl_parse_header_dev_get(struct ethnl_req_info *req_info, const struct nlattr *nest, struct net *net, struct netlink_ext_ack *extack, bool require_dev); int ethnl_fill_reply_header(struct sk_buff *skb, struct net_device *dev, u16 attrtype); struct sk_buff *ethnl_reply_init(size_t payload, struct net_device *dev, u8 cmd, u16 hdr_attrtype, struct genl_info *info, void **ehdrp); void *ethnl_dump_put(struct sk_buff *skb, struct netlink_callback *cb, u8 cmd); void *ethnl_bcastmsg_put(struct sk_buff *skb, u8 cmd); int ethnl_multicast(struct sk_buff *skb, struct net_device *dev); /** * ethnl_strz_size() - calculate attribute length for fixed size string * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Return: total length of an attribute with null terminated string from @s */ static inline int ethnl_strz_size(const char *s) { return nla_total_size(strnlen(s, ETH_GSTRING_LEN) + 1); } /** * ethnl_put_strz() - put string attribute with fixed size string * @skb: skb with the message * @attrype: attribute type * @s: ETH_GSTRING_LEN sized string (may not be null terminated) * * Puts an attribute with null terminated string from @s into the message. * * Return: 0 on success, negative error code on failure */ static inline int ethnl_put_strz(struct sk_buff *skb, u16 attrtype, const char *s) { unsigned int len = strnlen(s, ETH_GSTRING_LEN); struct nlattr *attr; attr = nla_reserve(skb, attrtype, len + 1); if (!attr) return -EMSGSIZE; memcpy(nla_data(attr), s, len); ((char *)nla_data(attr))[len] = '\0'; return 0; } /** * ethnl_update_u32() - update u32 value from NLA_U32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u32 value from NLA_U32 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u32(u32 *dst, const struct nlattr *attr, bool *mod) { u32 val; if (!attr) return; val = nla_get_u32(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_u8() - update u8 value from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Copy the u8 value from NLA_U8 netlink attribute @attr into variable * pointed to by @dst; do nothing if @attr is null. Bool pointed to by @mod * is set to true if this function changed the value of *dst, otherwise it * is left as is. */ static inline void ethnl_update_u8(u8 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = nla_get_u8(attr); if (*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_bool32() - update u32 used as bool from NLA_U8 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to set u32 variable * pointed to by @dst to 0 (if zero) or 1 (if not); do nothing if @attr is * null. Bool pointed to by @mod is set to true if this function changed the * logical value of *dst, otherwise it is left as is. */ static inline void ethnl_update_bool32(u32 *dst, const struct nlattr *attr, bool *mod) { u8 val; if (!attr) return; val = !!nla_get_u8(attr); if (!!*dst == val) return; *dst = val; *mod = true; } /** * ethnl_update_binary() - update binary data from NLA_BINARY atribute * @dst: value to update * @len: destination buffer length * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Use the u8 value from NLA_U8 netlink attribute @attr to rewrite data block * of length @len at @dst by attribute payload; do nothing if @attr is null. * Bool pointed to by @mod is set to true if this function changed the logical * value of *dst, otherwise it is left as is. */ static inline void ethnl_update_binary(void *dst, unsigned int len, const struct nlattr *attr, bool *mod) { if (!attr) return; if (nla_len(attr) < len) len = nla_len(attr); if (!memcmp(dst, nla_data(attr), len)) return; memcpy(dst, nla_data(attr), len); *mod = true; } /** * ethnl_update_bitfield32() - update u32 value from NLA_BITFIELD32 attribute * @dst: value to update * @attr: netlink attribute with new value or null * @mod: pointer to bool for modification tracking * * Update bits in u32 value which are set in attribute's mask to values from * attribute's value. Do nothing if @attr is null or the value wouldn't change; * otherwise, set bool pointed to by @mod to true. */ static inline void ethnl_update_bitfield32(u32 *dst, const struct nlattr *attr, bool *mod) { struct nla_bitfield32 change; u32 newval; if (!attr) return; change = nla_get_bitfield32(attr); newval = (*dst & ~change.selector) | (change.value & change.selector); if (*dst == newval) return; *dst = newval; *mod = true; } /** * ethnl_reply_header_size() - total size of reply header * * This is an upper estimate so that we do not need to hold RTNL lock longer * than necessary (to prevent rename between size estimate and composing the * message). Accounts only for device ifindex and name as those are the only * attributes ethnl_fill_reply_header() puts into the reply header. */ static inline unsigned int ethnl_reply_header_size(void) { return nla_total_size(nla_total_size(sizeof(u32)) + nla_total_size(IFNAMSIZ)); } /* GET request handling */ /* Unified processing of GET requests uses two data structures: request info * and reply data. Request info holds information parsed from client request * and its stays constant through all request processing. Reply data holds data * retrieved from ethtool_ops callbacks or other internal sources which is used * to compose the reply. When processing a dump request, request info is filled * only once (when the request message is parsed) but reply data is filled for * each reply message. * * Both structures consist of part common for all request types (struct * ethnl_req_info and struct ethnl_reply_data defined below) and optional * parts specific for each request type. Common part always starts at offset 0. */ /** * struct ethnl_req_info - base type of request information for GET requests * @dev: network device the request is for (may be null) * @flags: request flags common for all request types * * This is a common base for request specific structures holding data from * parsed userspace request. These always embed struct ethnl_req_info at * zero offset. */ struct ethnl_req_info { struct net_device *dev; u32 flags; }; /** * struct ethnl_reply_data - base type of reply data for GET requests * @dev: device for current reply message; in single shot requests it is * equal to &ethnl_req_info.dev; in dumps it's different for each * reply message * * This is a common base for request specific structures holding data for * kernel reply message. These always embed struct ethnl_reply_data at zero * offset. */ struct ethnl_reply_data { struct net_device *dev; }; static inline int ethnl_ops_begin(struct net_device *dev) { if (dev && dev->reg_state == NETREG_UNREGISTERING) return -ENODEV; if (dev && dev->ethtool_ops->begin) return dev->ethtool_ops->begin(dev); else return 0; } static inline void ethnl_ops_complete(struct net_device *dev) { if (dev && dev->ethtool_ops->complete) dev->ethtool_ops->complete(dev); } /** * struct ethnl_request_ops - unified handling of GET requests * @request_cmd: command id for request (GET) * @reply_cmd: command id for reply (GET_REPLY) * @hdr_attr: attribute type for request header * @req_info_size: size of request info * @reply_data_size: size of reply data * @allow_nodev_do: allow non-dump request with no device identification * @parse_request: * Parse request except common header (struct ethnl_req_info). Common * header is already filled on entry, the rest up to @repdata_offset * is zero initialized. This callback should only modify type specific * request info by parsed attributes from request message. * @prepare_data: * Retrieve and prepare data needed to compose a reply message. Calls to * ethtool_ops handlers are limited to this callback. Common reply data * (struct ethnl_reply_data) is filled on entry, type specific part after * it is zero initialized. This callback should only modify the type * specific part of reply data. Device identification from struct * ethnl_reply_data is to be used as for dump requests, it iterates * through network devices while dev member of struct ethnl_req_info * points to the device from client request. * @reply_size: * Estimate reply message size. Returned value must be sufficient for * message payload without common reply header. The callback may returned * estimate higher than actual message size if exact calculation would * not be worth the saved memory space. * @fill_reply: * Fill reply message payload (except for common header) from reply data. * The callback must not generate more payload than previously called * ->reply_size() estimated. * @cleanup_data: * Optional cleanup called when reply data is no longer needed. Can be * used e.g. to free any additional data structures outside the main * structure which were allocated by ->prepare_data(). When processing * dump requests, ->cleanup() is called for each message. * * Description of variable parts of GET request handling when using the * unified infrastructure. When used, a pointer to an instance of this * structure is to be added to &ethnl_default_requests array and generic * handlers ethnl_default_doit(), ethnl_default_dumpit(), * ethnl_default_start() and ethnl_default_done() used in @ethtool_genl_ops; * ethnl_default_notify() can be used in @ethnl_notify_handlers to send * notifications of the corresponding type. */ struct ethnl_request_ops { u8 request_cmd; u8 reply_cmd; u16 hdr_attr; unsigned int req_info_size; unsigned int reply_data_size; bool allow_nodev_do; int (*parse_request)(struct ethnl_req_info *req_info, struct nlattr **tb, struct netlink_ext_ack *extack); int (*prepare_data)(const struct ethnl_req_info *req_info, struct ethnl_reply_data *reply_data, struct genl_info *info); int (*reply_size)(const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); int (*fill_reply)(struct sk_buff *skb, const struct ethnl_req_info *req_info, const struct ethnl_reply_data *reply_data); void (*cleanup_data)(struct ethnl_reply_data *reply_data); }; /* request handlers */ extern const struct ethnl_request_ops ethnl_strset_request_ops; extern const struct ethnl_request_ops ethnl_linkinfo_request_ops; extern const struct ethnl_request_ops ethnl_linkmodes_request_ops; extern const struct ethnl_request_ops ethnl_linkstate_request_ops; extern const struct ethnl_request_ops ethnl_debug_request_ops; extern const struct ethnl_request_ops ethnl_wol_request_ops; extern const struct ethnl_request_ops ethnl_features_request_ops; extern const struct ethnl_request_ops ethnl_privflags_request_ops; extern const struct ethnl_request_ops ethnl_rings_request_ops; extern const struct ethnl_request_ops ethnl_channels_request_ops; extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_strset_get_policy[ETHTOOL_A_STRSET_COUNTS_ONLY + 1]; extern const struct nla_policy ethnl_linkinfo_get_policy[ETHTOOL_A_LINKINFO_HEADER + 1]; extern const struct nla_policy ethnl_linkinfo_set_policy[ETHTOOL_A_LINKINFO_TP_MDIX_CTRL + 1]; extern const struct nla_policy ethnl_linkmodes_get_policy[ETHTOOL_A_LINKMODES_HEADER + 1]; extern const struct nla_policy ethnl_linkmodes_set_policy[ETHTOOL_A_LINKMODES_MASTER_SLAVE_CFG + 1]; extern const struct nla_policy ethnl_linkstate_get_policy[ETHTOOL_A_LINKSTATE_HEADER + 1]; extern const struct nla_policy ethnl_debug_get_policy[ETHTOOL_A_DEBUG_HEADER + 1]; extern const struct nla_policy ethnl_debug_set_policy[ETHTOOL_A_DEBUG_MSGMASK + 1]; extern const struct nla_policy ethnl_wol_get_policy[ETHTOOL_A_WOL_HEADER + 1]; extern const struct nla_policy ethnl_wol_set_policy[ETHTOOL_A_WOL_SOPASS + 1]; extern const struct nla_policy ethnl_features_get_policy[ETHTOOL_A_FEATURES_HEADER + 1]; extern const struct nla_policy ethnl_features_set_policy[ETHTOOL_A_FEATURES_WANTED + 1]; extern const struct nla_policy ethnl_privflags_get_policy[ETHTOOL_A_PRIVFLAGS_HEADER + 1]; extern const struct nla_policy ethnl_privflags_set_policy[ETHTOOL_A_PRIVFLAGS_FLAGS + 1]; extern const struct nla_policy ethnl_rings_get_policy[ETHTOOL_A_RINGS_HEADER + 1]; extern const struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1]; extern const struct nla_policy ethnl_eee_set_policy[ETHTOOL_A_EEE_TX_LPI_TIMER + 1]; extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); int ethnl_set_debug(struct sk_buff *skb, struct genl_info *info); int ethnl_set_wol(struct sk_buff *skb, struct genl_info *info); int ethnl_set_features(struct sk_buff *skb, struct genl_info *info); int ethnl_set_privflags(struct sk_buff *skb, struct genl_info *info); int ethnl_set_rings(struct sk_buff *skb, struct genl_info *info); int ethnl_set_channels(struct sk_buff *skb, struct genl_info *info); int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info); int ethnl_set_pause(struct sk_buff *skb, struct genl_info *info); int ethnl_set_eee(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test(struct sk_buff *skb, struct genl_info *info); int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); #endif /* _NET_ETHTOOL_NETLINK_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 /* * DRBG based on NIST SP800-90A * * Copyright Stephan Mueller <smueller@chronox.de>, 2014 * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, and the entire permission notice in its entirety, * including the disclaimer of warranties. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. The name of the author may not be used to endorse or promote * products derived from this software without specific prior * written permission. * * ALTERNATIVELY, this product may be distributed under the terms of * the GNU General Public License, in which case the provisions of the GPL are * required INSTEAD OF the above restrictions. (This clause is * necessary due to a potential bad interaction between the GPL and * the restrictions contained in a BSD-style copyright.) * * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE, ALL OF * WHICH ARE HEREBY DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE * USE OF THIS SOFTWARE, EVEN IF NOT ADVISED OF THE POSSIBILITY OF SUCH * DAMAGE. */ #ifndef _DRBG_H #define _DRBG_H #include <linux/random.h> #include <linux/scatterlist.h> #include <crypto/hash.h> #include <crypto/skcipher.h> #include <linux/module.h> #include <linux/crypto.h> #include <linux/slab.h> #include <crypto/internal/rng.h> #include <crypto/rng.h> #include <linux/fips.h> #include <linux/mutex.h> #include <linux/list.h> #include <linux/workqueue.h> /* * Concatenation Helper and string operation helper * * SP800-90A requires the concatenation of different data. To avoid copying * buffers around or allocate additional memory, the following data structure * is used to point to the original memory with its size. In addition, it * is used to build a linked list. The linked list defines the concatenation * of individual buffers. The order of memory block referenced in that * linked list determines the order of concatenation. */ struct drbg_string { const unsigned char *buf; size_t len; struct list_head list; }; static inline void drbg_string_fill(struct drbg_string *string, const unsigned char *buf, size_t len) { string->buf = buf; string->len = len; INIT_LIST_HEAD(&string->list); } struct drbg_state; typedef uint32_t drbg_flag_t; struct drbg_core { drbg_flag_t flags; /* flags for the cipher */ __u8 statelen; /* maximum state length */ __u8 blocklen_bytes; /* block size of output in bytes */ char cra_name[CRYPTO_MAX_ALG_NAME]; /* mapping to kernel crypto API */ /* kernel crypto API backend cipher name */ char backend_cra_name[CRYPTO_MAX_ALG_NAME]; }; struct drbg_state_ops { int (*update)(struct drbg_state *drbg, struct list_head *seed, int reseed); int (*generate)(struct drbg_state *drbg, unsigned char *buf, unsigned int buflen, struct list_head *addtl); int (*crypto_init)(struct drbg_state *drbg); int (*crypto_fini)(struct drbg_state *drbg); }; struct drbg_test_data { struct drbg_string *testentropy; /* TEST PARAMETER: test entropy */ }; struct drbg_state { struct mutex drbg_mutex; /* lock around DRBG */ unsigned char *V; /* internal state 10.1.1.1 1a) */ unsigned char *Vbuf; /* hash: static value 10.1.1.1 1b) hmac / ctr: key */ unsigned char *C; unsigned char *Cbuf; /* Number of RNG requests since last reseed -- 10.1.1.1 1c) */ size_t reseed_ctr; size_t reseed_threshold; /* some memory the DRBG can use for its operation */ unsigned char *scratchpad; unsigned char *scratchpadbuf; void *priv_data; /* Cipher handle */ struct crypto_skcipher *ctr_handle; /* CTR mode cipher handle */ struct skcipher_request *ctr_req; /* CTR mode request handle */ __u8 *outscratchpadbuf; /* CTR mode output scratchpad */ __u8 *outscratchpad; /* CTR mode aligned outbuf */ struct crypto_wait ctr_wait; /* CTR mode async wait obj */ struct scatterlist sg_in, sg_out; /* CTR mode SGLs */ bool seeded; /* DRBG fully seeded? */ bool pr; /* Prediction resistance enabled? */ bool fips_primed; /* Continuous test primed? */ unsigned char *prev; /* FIPS 140-2 continuous test value */ struct work_struct seed_work; /* asynchronous seeding support */ struct crypto_rng *jent; const struct drbg_state_ops *d_ops; const struct drbg_core *core; struct drbg_string test_data; struct random_ready_callback random_ready; }; static inline __u8 drbg_statelen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->statelen; return 0; } static inline __u8 drbg_blocklen(struct drbg_state *drbg) { if (drbg && drbg->core) return drbg->core->blocklen_bytes; return 0; } static inline __u8 drbg_keylen(struct drbg_state *drbg) { if (drbg && drbg->core) return (drbg->core->statelen - drbg->core->blocklen_bytes); return 0; } static inline size_t drbg_max_request_bytes(struct drbg_state *drbg) { /* SP800-90A requires the limit 2**19 bits, but we return bytes */ return (1 << 16); } static inline size_t drbg_max_addtl(struct drbg_state *drbg) { /* SP800-90A requires 2**35 bytes additional info str / pers str */ #if (__BITS_PER_LONG == 32) /* * SP800-90A allows smaller maximum numbers to be returned -- we * return SIZE_MAX - 1 to allow the verification of the enforcement * of this value in drbg_healthcheck_sanity. */ return (SIZE_MAX - 1); #else return (1UL<<35); #endif } static inline size_t drbg_max_requests(struct drbg_state *drbg) { /* SP800-90A requires 2**48 maximum requests before reseeding */ return (1<<20); } /* * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data. * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl) { return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_generate() to allow the caller to provide additional data and * allow furnishing of test_data * * @drng DRBG handle -- see crypto_rng_get_bytes * @outbuf output buffer -- see crypto_rng_get_bytes * @outlen length of output buffer -- see crypto_rng_get_bytes * @addtl_input additional information string input buffer * @addtllen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_get_bytes */ static inline int crypto_drbg_get_bytes_addtl_test(struct crypto_rng *drng, unsigned char *outbuf, unsigned int outlen, struct drbg_string *addtl, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_generate(drng, addtl->buf, addtl->len, outbuf, outlen); } /* * TEST code * * This is a wrapper to the kernel crypto API function of * crypto_rng_reset() to allow the caller to provide test_data * * @drng DRBG handle -- see crypto_rng_reset * @pers personalization string input buffer * @perslen length of additional information string buffer * @test_data filled test data * * return * see crypto_rng_reset */ static inline int crypto_drbg_reset_test(struct crypto_rng *drng, struct drbg_string *pers, struct drbg_test_data *test_data) { crypto_rng_set_entropy(drng, test_data->testentropy->buf, test_data->testentropy->len); return crypto_rng_reset(drng, pers->buf, pers->len); } /* DRBG type flags */ #define DRBG_CTR ((drbg_flag_t)1<<0) #define DRBG_HMAC ((drbg_flag_t)1<<1) #define DRBG_HASH ((drbg_flag_t)1<<2) #define DRBG_TYPE_MASK (DRBG_CTR | DRBG_HMAC | DRBG_HASH) /* DRBG strength flags */ #define DRBG_STRENGTH128 ((drbg_flag_t)1<<3) #define DRBG_STRENGTH192 ((drbg_flag_t)1<<4) #define DRBG_STRENGTH256 ((drbg_flag_t)1<<5) #define DRBG_STRENGTH_MASK (DRBG_STRENGTH128 | DRBG_STRENGTH192 | \ DRBG_STRENGTH256) enum drbg_prefixes { DRBG_PREFIX0 = 0x00, DRBG_PREFIX1, DRBG_PREFIX2, DRBG_PREFIX3 }; #endif /* _DRBG_H */
1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 1991, 1992 Linus Torvalds * Copyright (C) 1994, Karl Keyte: Added support for disk statistics * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> * - July2000 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 */ /* * This handles all read/write requests to block devices */ #include <linux/kernel.h> #include <linux/module.h> #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/blk-pm.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/kernel_stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/completion.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/writeback.h> #include <linux/task_io_accounting_ops.h> #include <linux/fault-inject.h> #include <linux/list_sort.h> #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> #include <linux/blk-cgroup.h> #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> #include <linux/psi.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #define CREATE_TRACE_POINTS #include <trace/events/block.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-rq-qos.h" struct dentry *blk_debugfs_root; EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd */ static struct workqueue_struct *kblockd_workqueue; /** * blk_queue_flag_set - atomically set a queue flag * @flag: flag to be set * @q: request queue */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); /** * blk_queue_flag_clear - atomically clear a queue flag * @flag: flag to be cleared * @q: request queue */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); /** * blk_queue_flag_test_and_set - atomically test and set a queue flag * @flag: flag to be set * @q: request queue * * Returns the previous value of @flag - 0 if the flag was not set and 1 if * the flag was already set. */ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) { return test_and_set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; rq->start_time_ns = ktime_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } EXPORT_SYMBOL(blk_rq_init); #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), REQ_OP_NAME(WRITE), REQ_OP_NAME(FLUSH), REQ_OP_NAME(DISCARD), REQ_OP_NAME(SECURE_ERASE), REQ_OP_NAME(ZONE_RESET), REQ_OP_NAME(ZONE_RESET_ALL), REQ_OP_NAME(ZONE_OPEN), REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), REQ_OP_NAME(SCSI_IN), REQ_OP_NAME(SCSI_OUT), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; #undef REQ_OP_NAME /** * blk_op_str - Return string XXX in the REQ_OP_XXX. * @op: REQ_OP_XXX. * * Description: Centralize block layer function to convert REQ_OP_XXX into * string format. Useful in the debugging and tracing bio or request. For * invalid REQ_OP_XXX it returns string "UNKNOWN". */ inline const char *blk_op_str(unsigned int op) { const char *op_str = "UNKNOWN"; if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) op_str = blk_op_name[op]; return op_str; } EXPORT_SYMBOL_GPL(blk_op_str); static const struct { int errno; const char *name; } blk_errors[] = { [BLK_STS_OK] = { 0, "" }, [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, /* zone device specific errors */ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; blk_status_t errno_to_blk_status(int errno) { int i; for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { if (blk_errors[i].errno == errno) return (__force blk_status_t)i; } return BLK_STS_IOERR; } EXPORT_SYMBOL_GPL(errno_to_blk_status); int blk_status_to_errno(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return -EIO; return blk_errors[idx].errno; } EXPORT_SYMBOL_GPL(blk_status_to_errno); static void print_req_error(struct request *req, blk_status_t status, const char *caller) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return; printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " "phys_seg %u prio class %u\n", caller, blk_errors[idx].name, req->rq_disk ? req->rq_disk->disk_name : "?", blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), req->cmd_flags & ~REQ_OP_MASK, req->nr_phys_segments, IOPRIO_PRIO_CLASS(req->ioprio)); } static void req_bio_endio(struct request *rq, struct bio *bio, unsigned int nbytes, blk_status_t error) { if (error) bio->bi_status = error; if (unlikely(rq->rq_flags & RQF_QUIET)) bio_set_flag(bio, BIO_QUIET); bio_advance(bio, nbytes); if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. */ if (bio->bi_iter.bi_size) bio->bi_status = BLK_STS_IOERR; else bio->bi_iter.bi_sector = rq->__sector; } /* don't actually finish bio if it's part of flush sequence */ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) bio_endio(bio); } void blk_dump_rq_flags(struct request *rq, char *msg) { printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, rq->rq_disk ? rq->rq_disk->disk_name : "?", (unsigned long long) rq->cmd_flags); printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, len %u\n", rq->bio, rq->biotail, blk_rq_bytes(rq)); } EXPORT_SYMBOL(blk_dump_rq_flags); /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue * * Description: * The block layer may perform asynchronous callback activity * on a queue, such as calling the unplug function after a timeout. * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising * out of elevator or throttling code. That would require elevator_exit() * and blkcg_exit_queue() to be called with queue lock initialized. * */ void blk_sync_queue(struct request_queue *q) { del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); } EXPORT_SYMBOL(blk_sync_queue); /** * blk_set_pm_only - increment pm_only counter * @q: request queue pointer */ void blk_set_pm_only(struct request_queue *q) { atomic_inc(&q->pm_only); } EXPORT_SYMBOL_GPL(blk_set_pm_only); void blk_clear_pm_only(struct request_queue *q) { int pm_only; pm_only = atomic_dec_return(&q->pm_only); WARN_ON_ONCE(pm_only < 0); if (pm_only == 0) wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_clear_pm_only); /** * blk_put_queue - decrement the request_queue refcount * @q: the request_queue structure to decrement the refcount for * * Decrements the refcount of the request_queue kobject. When this reaches 0 * we'll have blk_release_queue() called. * * Context: Any context, but the last reference must not be dropped from * atomic context. */ void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } EXPORT_SYMBOL(blk_put_queue); void blk_set_queue_dying(struct request_queue *q) { blk_queue_flag_set(QUEUE_FLAG_DYING, q); /* * When queue DYING flag is set, we need to block new req * entering queue, so we call blk_freeze_queue_start() to * prevent I/O from crossing blk_queue_enter(). */ blk_freeze_queue_start(q); if (queue_is_mq(q)) blk_mq_wake_waiters(q); /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); } EXPORT_SYMBOL_GPL(blk_set_queue_dying); /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown * * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and * put it. All future requests will be failed immediately with -ENODEV. * * Context: can sleep */ void blk_cleanup_queue(struct request_queue *q) { /* cannot be called from atomic context */ might_sleep(); WARN_ON_ONCE(blk_queue_registered(q)); /* mark @q DYING, no new request or merges will be allowed afterwards */ blk_set_queue_dying(q); blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); /* * Drain all requests queued before DYING marking. Set DEAD flag to * prevent that blk_mq_run_hw_queues() accesses the hardware queues * after draining finished. */ blk_freeze_queue(q); rq_qos_exit(q); blk_queue_flag_set(QUEUE_FLAG_DEAD, q); /* for synchronous bio-based driver finish in-flight integrity i/o */ blk_flush_integrity(); /* @q won't process any more request, flush async actions */ del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); if (queue_is_mq(q)) blk_mq_exit_queue(q); /* * In theory, request pool of sched_tags belongs to request queue. * However, the current implementation requires tag_set for freeing * requests, so free the pool now. * * Queue has become frozen, there can't be any in-queue requests, so * it is safe to free requests now. */ mutex_lock(&q->sysfs_lock); if (q->elevator) blk_mq_sched_free_requests(q); mutex_unlock(&q->sysfs_lock); percpu_ref_exit(&q->q_usage_counter); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { const bool pm = flags & BLK_MQ_REQ_PM; while (true) { bool success = false; rcu_read_lock(); if (percpu_ref_tryget_live(&q->q_usage_counter)) { /* * The code that increments the pm_only counter is * responsible for ensuring that that counter is * globally visible before the queue is unfrozen. */ if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || !blk_queue_pm_only(q)) { success = true; } else { percpu_ref_put(&q->q_usage_counter); } } rcu_read_unlock(); if (success) return 0; if (flags & BLK_MQ_REQ_NOWAIT) return -EBUSY; /* * read pair of barrier in blk_freeze_queue_start(), * we need to order reading __PERCPU_REF_DEAD flag of * .q_usage_counter and reading .mq_freeze_depth or * queue dying flag, otherwise the following wait may * never return if the two reads are reordered. */ smp_rmb(); wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } } static inline int bio_queue_enter(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; bool nowait = bio->bi_opf & REQ_NOWAIT; int ret; ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); if (unlikely(ret)) { if (nowait && !blk_queue_dying(q)) bio_wouldblock_error(bio); else bio_io_error(bio); } return ret; } void blk_queue_exit(struct request_queue *q) { percpu_ref_put(&q->q_usage_counter); } static void blk_queue_usage_counter_release(struct percpu_ref *ref) { struct request_queue *q = container_of(ref, struct request_queue, q_usage_counter); wake_up_all(&q->mq_freeze_wq); } static void blk_rq_timed_out_timer(struct timer_list *t) { struct request_queue *q = from_timer(q, t, timeout); kblockd_schedule_work(&q->timeout_work); } static void blk_timeout_work(struct work_struct *work) { } struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; int ret; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) return NULL; q->last_merge = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); if (q->id < 0) goto fail_q; ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); if (ret) goto fail_id; q->backing_dev_info = bdi_alloc(node_id); if (!q->backing_dev_info) goto fail_split; q->stats = blk_alloc_queue_stats(); if (!q->stats) goto fail_stats; q->node = node_id; atomic_set(&q->nr_active_requests_shared_sbitmap, 0); timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif kobject_init(&q->kobj, &blk_queue_ktype); mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); mutex_init(&q->mq_freeze_lock); /* * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) goto fail_bdi; if (blkcg_init_queue(q)) goto fail_ref; blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_MAX_RQ; return q; fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: blk_free_queue_stats(q->stats); fail_stats: bdi_put(q->backing_dev_info); fail_split: bioset_exit(&q->bio_split); fail_id: ida_simple_remove(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } EXPORT_SYMBOL(blk_alloc_queue); /** * blk_get_queue - increment the request_queue refcount * @q: the request_queue structure to increment the refcount for * * Increment the refcount of the request_queue kobject. * * Context: Any context. */ bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { __blk_get_queue(q); return true; } return false; } EXPORT_SYMBOL(blk_get_queue); /** * blk_get_request - allocate a request * @q: request queue to allocate a request for * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. */ struct request *blk_get_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { struct request *req; WARN_ON_ONCE(op & REQ_NOWAIT); WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); req = blk_mq_alloc_request(q, op, flags); if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) q->mq_ops->initialize_rq_fn(req); return req; } EXPORT_SYMBOL(blk_get_request); void blk_put_request(struct request *req) { blk_mq_free_request(req); } EXPORT_SYMBOL(blk_put_request); static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; pr_info_ratelimited("attempt to access beyond end of device\n" "%s: rw=%d, want=%llu, limit=%llu\n", bio_devname(bio, b), bio->bi_opf, bio_end_sector(bio), maxsector); } #ifdef CONFIG_FAIL_MAKE_REQUEST static DECLARE_FAULT_ATTR(fail_make_request); static int __init setup_fail_make_request(char *str) { return setup_fault_attr(&fail_make_request, str); } __setup("fail_make_request=", setup_fail_make_request); static bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return part->make_it_fail && should_fail(&fail_make_request, bytes); } static int __init fail_make_request_debugfs(void) { struct dentry *dir = fault_create_debugfs_attr("fail_make_request", NULL, &fail_make_request); return PTR_ERR_OR_ZERO(dir); } late_initcall(fail_make_request_debugfs); #else /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool should_fail_request(struct hd_struct *part, unsigned int bytes) { return false; } #endif /* CONFIG_FAIL_MAKE_REQUEST */ static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) { const int op = bio_op(bio); if (part->policy && op_is_write(op)) { char b[BDEVNAME_SIZE]; if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) return false; WARN_ONCE(1, "Trying to write to read-only block-device %s (partno %d)\n", bio_devname(bio, b), part->partno); /* Older lvm-tools actually trigger this */ return false; } return false; } static noinline int should_fail_bio(struct bio *bio) { if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) return -EIO; return 0; } ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); /* * Check whether this bio extends beyond the end of the device or partition. * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. */ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) { unsigned int nr_sectors = bio_sectors(bio); if (nr_sectors && maxsector && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { handle_bad_sector(bio, maxsector); return -EIO; } return 0; } /* * Remap block n of partition p to block n+start(p) of the disk. */ static inline int blk_partition_remap(struct bio *bio) { struct hd_struct *p; int ret = -EIO; rcu_read_lock(); p = __disk_get_part(bio->bi_disk, bio->bi_partno); if (unlikely(!p)) goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) goto out; if (unlikely(bio_check_ro(bio, p))) goto out; if (bio_sectors(bio)) { if (bio_check_eod(bio, part_nr_sects_read(p))) goto out; bio->bi_iter.bi_sector += p->start_sect; trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), bio->bi_iter.bi_sector - p->start_sect); } bio->bi_partno = 0; ret = 0; out: rcu_read_unlock(); return ret; } /* * Check write append to a zoned block device. */ static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { sector_t pos = bio->bi_iter.bi_sector; int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ if (!blk_queue_is_zoned(q)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ if (pos & (blk_queue_zone_sectors(q) - 1) || !blk_queue_zone_is_seq(q, pos)) return BLK_STS_IOERR; /* * Not allowed to cross zone boundaries. Otherwise, the BIO will be * split and could result in non-contiguous sectors being written in * different zones. */ if (nr_sectors > q->limits.chunk_sectors) return BLK_STS_IOERR; /* Make sure the BIO is small enough and will not get split */ if (nr_sectors > q->limits.max_zone_append_sectors) return BLK_STS_IOERR; bio->bi_opf |= REQ_NOMERGE; return BLK_STS_OK; } static noinline_for_stack bool submit_bio_checks(struct bio *bio) { struct request_queue *q = bio->bi_disk->queue; blk_status_t status = BLK_STS_IOERR; struct blk_plug *plug; might_sleep(); plug = blk_mq_plug(q, bio); if (plug && plug->nowait) bio->bi_opf |= REQ_NOWAIT; /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. */ if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) goto not_supported; if (should_fail_bio(bio)) goto end_io; if (bio->bi_partno) { if (unlikely(blk_partition_remap(bio))) goto end_io; } else { if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) goto end_io; if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) goto end_io; } /* * Filter flush bio's early so that bio based drivers without flush * support don't have to worry about them. */ if (op_is_flush(bio->bi_opf) && !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); if (!bio_sectors(bio)) { status = BLK_STS_OK; goto end_io; } } if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) bio->bi_opf &= ~REQ_HIPRI; switch (bio_op(bio)) { case REQ_OP_DISCARD: if (!blk_queue_discard(q)) goto not_supported; break; case REQ_OP_SECURE_ERASE: if (!blk_queue_secure_erase(q)) goto not_supported; break; case REQ_OP_WRITE_SAME: if (!q->limits.max_write_same_sectors) goto not_supported; break; case REQ_OP_ZONE_APPEND: status = blk_check_zone_append(q, bio); if (status != BLK_STS_OK) goto end_io; break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: if (!blk_queue_is_zoned(q)) goto not_supported; break; case REQ_OP_ZONE_RESET_ALL: if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) goto not_supported; break; case REQ_OP_WRITE_ZEROES: if (!q->limits.max_write_zeroes_sectors) goto not_supported; break; default: break; } /* * Various block parts want %current->io_context, so allocate it up * front rather than dealing with lots of pain to allocate it only * where needed. This may fail and the block layer knows how to live * with it. */ if (unlikely(!current->io_context)) create_task_io_context(current, GFP_ATOMIC, q->node); if (blk_throtl_bio(bio)) return false; blk_cgroup_bio_start(bio); blkcg_bio_issue_init(bio); if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { trace_block_bio_queue(q, bio); /* Now that enqueuing has been traced, we need to trace * completion as well. */ bio_set_flag(bio, BIO_TRACE_COMPLETION); } return true; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); return false; } static blk_qc_t __submit_bio(struct bio *bio) { struct gendisk *disk = bio->bi_disk; blk_qc_t ret = BLK_QC_T_NONE; if (blk_crypto_bio_prep(&bio)) { if (!disk->fops->submit_bio) return blk_mq_submit_bio(bio); ret = disk->fops->submit_bio(bio); } blk_queue_exit(disk->queue); return ret; } /* * The loop in this function may be a bit non-obvious, and so deserves some * explanation: * * - Before entering the loop, bio->bi_next is NULL (as all callers ensure * that), so we have a list with a single bio. * - We pretend that we have just taken it off a longer list, so we assign * bio_list to a pointer to the bio_list_on_stack, thus initialising the * bio_list of new bios to be added. ->submit_bio() may indeed add some more * bios through a recursive call to submit_bio_noacct. If it did, we find a * non-NULL value in bio_list and re-enter the loop from the top. * - In this case we really did just take the bio of the top of the list (no * pretending) and so remove it from bio_list, and call into ->submit_bio() * again. * * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. * bio_list_on_stack[1] contains bios that were submitted before the current * ->submit_bio_bio, but that haven't been processed yet. */ static blk_qc_t __submit_bio_noacct(struct bio *bio) { struct bio_list bio_list_on_stack[2]; blk_qc_t ret = BLK_QC_T_NONE; BUG_ON(bio->bi_next); bio_list_init(&bio_list_on_stack[0]); current->bio_list = bio_list_on_stack; do { struct request_queue *q = bio->bi_disk->queue; struct bio_list lower, same; if (unlikely(bio_queue_enter(bio) != 0)) continue; /* * Create a fresh bio_list for all subordinate requests. */ bio_list_on_stack[1] = bio_list_on_stack[0]; bio_list_init(&bio_list_on_stack[0]); ret = __submit_bio(bio); /* * Sort new bios into those for a lower level and those for the * same level. */ bio_list_init(&lower); bio_list_init(&same); while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) if (q == bio->bi_disk->queue) bio_list_add(&same, bio); else bio_list_add(&lower, bio); /* * Now assemble so we handle the lowest level first. */ bio_list_merge(&bio_list_on_stack[0], &lower); bio_list_merge(&bio_list_on_stack[0], &same); bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); current->bio_list = NULL; return ret; } static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) { struct bio_list bio_list[2] = { }; blk_qc_t ret = BLK_QC_T_NONE; current->bio_list = bio_list; do { struct gendisk *disk = bio->bi_disk; if (unlikely(bio_queue_enter(bio) != 0)) continue; if (!blk_crypto_bio_prep(&bio)) { blk_queue_exit(disk->queue); ret = BLK_QC_T_NONE; continue; } ret = blk_mq_submit_bio(bio); } while ((bio = bio_list_pop(&bio_list[0]))); current->bio_list = NULL; return ret; } /** * submit_bio_noacct - re-submit a bio to the block device layer for I/O * @bio: The bio describing the location in memory and on the device. * * This is a version of submit_bio() that shall only be used for I/O that is * resubmitted to lower level drivers by stacking block drivers. All file * systems and other upper level users of the block layer should use * submit_bio() instead. */ blk_qc_t submit_bio_noacct(struct bio *bio) { if (!submit_bio_checks(bio)) return BLK_QC_T_NONE; /* * We only want one ->submit_bio to be active at a time, else stack * usage with stacked devices could be a problem. Use current->bio_list * to collect a list of requests submited by a ->submit_bio method while * it is active, and then process them after it returned. */ if (current->bio_list) { bio_list_add(&current->bio_list[0], bio); return BLK_QC_T_NONE; } if (!bio->bi_disk->fops->submit_bio) return __submit_bio_noacct_mq(bio); return __submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio_noacct); /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The * bio will be send to the device described by the bi_disk and bi_partno fields. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has * been called. */ blk_qc_t submit_bio(struct bio *bio) { if (blkcg_punt_bio_submit(bio)) return BLK_QC_T_NONE; /* * If it's a regular read/write or a barrier with data attached, * go through the normal accounting stuff before submission. */ if (bio_has_data(bio)) { unsigned int count; if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) count = queue_logical_block_size(bio->bi_disk->queue) >> 9; else count = bio_sectors(bio); if (op_is_write(bio_op(bio))) { count_vm_events(PGPGOUT, count); } else { task_io_account_read(bio->bi_iter.bi_size); count_vm_events(PGPGIN, count); } if (unlikely(block_dump)) { char b[BDEVNAME_SIZE]; printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", current->comm, task_pid_nr(current), op_is_write(bio_op(bio)) ? "WRITE" : "READ", (unsigned long long)bio->bi_iter.bi_sector, bio_devname(bio, b), count); } } /* * If we're reading data that is part of the userspace workingset, count * submission time as memory stall. When the device is congested, or * the submitting cgroup IO-throttled, submission can be a significant * part of overall IO time. */ if (unlikely(bio_op(bio) == REQ_OP_READ && bio_flagged(bio, BIO_WORKINGSET))) { unsigned long pflags; blk_qc_t ret; psi_memstall_enter(&pflags); ret = submit_bio_noacct(bio); psi_memstall_leave(&pflags); return ret; } return submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for the new queue limits * @q: the queue * @rq: the request being checked * * Description: * @rq may have been made based on weaker limitations of upper-level queues * in request stacking drivers, and it may violate the limitation of @q. * Since the block layer and the underlying device driver trust @rq * after it is inserted to @q, it should be checked against @q before * the insertion using this generic function. * * Request stacking drivers like request-based dm may change the queue * limits when retrying requests on other queues. Those requests need * to be checked against the new queue limits again during dispatch. */ static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, struct request *rq) { unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); if (blk_rq_sectors(rq) > max_sectors) { /* * SCSI device does not have a good way to return if * Write Same/Zero is actually supported. If a device rejects * a non-read/write command (discard, write same,etc.) the * low-level device driver will set the relevant queue limit to * 0 to prevent blk-lib from issuing more of the offending * operations. Commands queued prior to the queue limit being * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O * errors being propagated to upper layers. */ if (max_sectors == 0) return BLK_STS_NOTSUPP; printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", __func__, blk_rq_sectors(rq), max_sectors); return BLK_STS_IOERR; } /* * queue's settings related to segment counting like q->bounce_pfn * may differ from that of other stacking queues. * Recalculate it to check the request correctly on this queue's * limitation. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); if (rq->nr_phys_segments > queue_max_segments(q)) { printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", __func__, rq->nr_phys_segments, queue_max_segments(q)); return BLK_STS_IOERR; } return BLK_STS_OK; } /** * blk_insert_cloned_request - Helper for stacking drivers to submit a request * @q: the queue to submit the request * @rq: the request being queued */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { blk_status_t ret; ret = blk_cloned_rq_check_limits(q, rq); if (ret != BLK_STS_OK) return ret; if (rq->rq_disk && should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; if (blk_crypto_insert_cloned_request(rq)) return BLK_STS_IOERR; if (blk_queue_io_stat(q)) blk_account_io_start(rq); /* * Since we have a scheduler attached on the top device, * bypass a potential scheduler on the bottom device for * insert. */ return blk_mq_request_issue_directly(rq, true); } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); /** * blk_rq_err_bytes - determine number of bytes till the next failure boundary * @rq: request to examine * * Description: * A request could be merge of IOs which require different failure * handling. This function determines the number of bytes which * can be failed from the beginning of the request without * crossing into area which need to be retried further. * * Return: * The number of bytes to fail. */ unsigned int blk_rq_err_bytes(const struct request *rq) { unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; unsigned int bytes = 0; struct bio *bio; if (!(rq->rq_flags & RQF_MIXED_MERGE)) return blk_rq_bytes(rq); /* * Currently the only 'mixing' which can happen is between * different fastfail types. We can safely fail portions * which have all the failfast bits that the first one has - * the ones which are at least as eager to fail as the first * one. */ for (bio = rq->bio; bio; bio = bio->bi_next) { if ((bio->bi_opf & ff) != ff) break; bytes += bio->bi_iter.bi_size; } /* this could lead to infinite loop */ BUG_ON(blk_rq_bytes(rq) && !bytes); return bytes; } EXPORT_SYMBOL_GPL(blk_rq_err_bytes); static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) { unsigned long stamp; again: stamp = READ_ONCE(part->stamp); if (unlikely(stamp != now)) { if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) __part_stat_add(part, io_ticks, end ? now - stamp : 1); } if (part->partno) { part = &part_to_disk(part)->part0; goto again; } } static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (req->part && blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; part_stat_lock(); part = req->part; part_stat_add(part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } void blk_account_io_done(struct request *req, u64 now) { /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the * containing request is enough. */ if (req->part && blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; part_stat_lock(); part = req->part; update_io_ticks(part, jiffies, true); part_stat_inc(part, ios[sgrp]); part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); part_stat_unlock(); hd_struct_put(part); } } void blk_account_io_start(struct request *rq) { if (!blk_do_io_stat(rq)) return; rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); part_stat_lock(); update_io_ticks(rq->part, jiffies, false); part_stat_unlock(); } static unsigned long __part_start_io_acct(struct hd_struct *part, unsigned int sectors, unsigned int op) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); part_stat_lock(); update_io_ticks(part, now, false); part_stat_inc(part, ios[sgrp]); part_stat_add(part, sectors[sgrp], sectors); part_stat_local_inc(part, in_flight[op_is_write(op)]); part_stat_unlock(); return now; } unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, struct bio *bio) { *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); } EXPORT_SYMBOL_GPL(part_start_io_acct); unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, unsigned int op) { return __part_start_io_acct(&disk->part0, sectors, op); } EXPORT_SYMBOL(disk_start_io_acct); static void __part_end_io_acct(struct hd_struct *part, unsigned int op, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); update_io_ticks(part, now, true); part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(part, in_flight[op_is_write(op)]); part_stat_unlock(); } void part_end_io_acct(struct hd_struct *part, struct bio *bio, unsigned long start_time) { __part_end_io_acct(part, bio_op(bio), start_time); hd_struct_put(part); } EXPORT_SYMBOL_GPL(part_end_io_acct); void disk_end_io_acct(struct gendisk *disk, unsigned int op, unsigned long start_time) { __part_end_io_acct(&disk->part0, op, start_time); } EXPORT_SYMBOL(disk_end_io_acct); /* * Steal bios from a request and add them to a bio list. * The request must not have been partially completed before. */ void blk_steal_bios(struct bio_list *list, struct request *rq) { if (rq->bio) { if (list->tail) list->tail->bi_next = rq->bio; else list->head = rq->bio; list->tail = rq->biotail; rq->bio = NULL; rq->biotail = NULL; } rq->__data_len = 0; } EXPORT_SYMBOL_GPL(blk_steal_bios); /** * blk_update_request - Special helper function for request stacking drivers * @req: the request being processed * @error: block status code * @nr_bytes: number of bytes to complete @req * * Description: * Ends I/O on a number of bytes attached to @req, but doesn't complete * the request structure even if @req doesn't have leftover. * If @req has leftover, sets it up for the next range of segments. * * This special helper function is only for request stacking drivers * (e.g. request-based dm) so that they can handle partial completion. * Actual device drivers should use blk_mq_end_request instead. * * Passing the result of blk_rq_bytes() as @nr_bytes guarantees * %false return from this function. * * Note: * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both * blk_rq_bytes() and in blk_update_request(). * * Return: * %false - this request doesn't have any more data * %true - this request has more data **/ bool blk_update_request(struct request *req, blk_status_t error, unsigned int nr_bytes) { int total_bytes; trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); if (!req->bio) return false; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && error == BLK_STS_OK) req->q->integrity.profile->complete_fn(req, nr_bytes); #endif if (unlikely(error && !blk_rq_is_passthrough(req) && !(req->rq_flags & RQF_QUIET))) print_req_error(req, error, __func__); blk_account_io_completion(req, nr_bytes); total_bytes = 0; while (req->bio) { struct bio *bio = req->bio; unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); if (bio_bytes == bio->bi_iter.bi_size) req->bio = bio->bi_next; /* Completion has already been traced */ bio_clear_flag(bio, BIO_TRACE_COMPLETION); req_bio_endio(req, bio, bio_bytes, error); total_bytes += bio_bytes; nr_bytes -= bio_bytes; if (!nr_bytes) break; } /* * completely done */ if (!req->bio) { /* * Reset counters so that the request stacking driver * can find how many bytes remain in the request * later. */ req->__data_len = 0; return false; } req->__data_len -= total_bytes; /* update sector only for requests with clear definition of sector */ if (!blk_rq_is_passthrough(req)) req->__sector += total_bytes >> 9; /* mixed attributes always follow the first bio */ if (req->rq_flags & RQF_MIXED_MERGE) { req->cmd_flags &= ~REQ_FAILFAST_MASK; req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; } if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { /* * If total number of sectors is less than the first segment * size, something has gone terribly wrong. */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { blk_dump_rq_flags(req, "request botched"); req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ req->nr_phys_segments = blk_recalc_rq_segments(req); } return true; } EXPORT_SYMBOL_GPL(blk_update_request); #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE /** * rq_flush_dcache_pages - Helper function to flush all pages in a request * @rq: the request to be flushed * * Description: * Flush all pages in @rq. */ void rq_flush_dcache_pages(struct request *rq) { struct req_iterator iter; struct bio_vec bvec; rq_for_each_segment(bvec, rq, iter) flush_dcache_page(bvec.bv_page); } EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); #endif /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy * @q : the queue of the device being checked * * Description: * Check if underlying low-level drivers of a device are busy. * If the drivers want to export their busy state, they must set own * exporting function using blk_queue_lld_busy() first. * * Basically, this function is used only by request stacking drivers * to stop dispatching requests to underlying devices when underlying * devices are busy. This behavior helps more I/O merging on the queue * of the request stacking driver and prevents I/O throughput regression * on burst I/O load. * * Return: * 0 - Not busy (The request stacking driver should dispatch request) * 1 - Busy (The request stacking driver should stop dispatching request) */ int blk_lld_busy(struct request_queue *q) { if (queue_is_mq(q) && q->mq_ops->busy) return q->mq_ops->busy(q); return 0; } EXPORT_SYMBOL_GPL(blk_lld_busy); /** * blk_rq_unprep_clone - Helper function to free all bios in a cloned request * @rq: the clone request to be cleaned up * * Description: * Free all bios in @rq for a cloned request. */ void blk_rq_unprep_clone(struct request *rq) { struct bio *bio; while ((bio = rq->bio) != NULL) { rq->bio = bio->bi_next; bio_put(bio); } } EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); /** * blk_rq_prep_clone - Helper function to setup clone request * @rq: the request to be setup * @rq_src: original request to be cloned * @bs: bio_set that bios for clone are allocated from * @gfp_mask: memory allocation mask for bio * @bio_ctr: setup function to be called for each clone bio. * Returns %0 for success, non %0 for failure. * @data: private data to be passed to @bio_ctr * * Description: * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. * Also, pages which the original bios are pointing to are not copied * and the cloned bios just point same pages. * So cloned bios must be completed before original bios, which means * the caller must complete @rq before @rq_src. */ int blk_rq_prep_clone(struct request *rq, struct request *rq_src, struct bio_set *bs, gfp_t gfp_mask, int (*bio_ctr)(struct bio *, struct bio *, void *), void *data) { struct bio *bio, *bio_src; if (!bs) bs = &fs_bio_set; __rq_for_each_bio(bio_src, rq_src) { bio = bio_clone_fast(bio_src, gfp_mask, bs); if (!bio) goto free_and_out; if (bio_ctr && bio_ctr(bio, bio_src, data)) goto free_and_out; if (rq->bio) { rq->biotail->bi_next = bio; rq->biotail = bio; } else { rq->bio = rq->biotail = bio; } bio = NULL; } /* Copy attributes of the original request to the clone request. */ rq->__sector = blk_rq_pos(rq_src); rq->__data_len = blk_rq_bytes(rq_src); if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { rq->rq_flags |= RQF_SPECIAL_PAYLOAD; rq->special_vec = rq_src->special_vec; } rq->nr_phys_segments = rq_src->nr_phys_segments; rq->ioprio = rq_src->ioprio; if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) goto free_and_out; return 0; free_and_out: if (bio) bio_put(bio); blk_rq_unprep_clone(rq); return -ENOMEM; } EXPORT_SYMBOL_GPL(blk_rq_prep_clone); int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); } EXPORT_SYMBOL(kblockd_schedule_work); int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay) { return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized * * Description: * blk_start_plug() indicates to the block layer an intent by the caller * to submit multiple I/O requests in a batch. The block layer may use * this hint to defer submitting I/Os from the caller until blk_finish_plug() * is called. However, the block layer may choose to submit requests * before a call to blk_finish_plug() if the number of queued I/Os * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if * the task schedules (see below). * * Tracking blk_plug inside the task_struct will help with auto-flushing the * pending I/O should the task end up blocking between blk_start_plug() and * blk_finish_plug(). This is important from a performance perspective, but * also ensures that we don't deadlock. For instance, if the task is blocking * for a memory allocation, memory reclaim could end up wanting to free a * page belonging to that request that is currently residing in our private * plug. By flushing the pending I/O when the process goes to sleep, we avoid * this kind of deadlock. */ void blk_start_plug(struct blk_plug *plug) { struct task_struct *tsk = current; /* * If this is a nested plug, don't actually assign it. */ if (tsk->plug) return; INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); plug->rq_count = 0; plug->multiple_queues = false; plug->nowait = false; /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier */ tsk->plug = plug; } EXPORT_SYMBOL(blk_start_plug); static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); while (!list_empty(&plug->cb_list)) { list_splice_init(&plug->cb_list, &callbacks); while (!list_empty(&callbacks)) { struct blk_plug_cb *cb = list_first_entry(&callbacks, struct blk_plug_cb, list); list_del(&cb->list); cb->callback(cb, from_schedule); } } } struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, int size) { struct blk_plug *plug = current->plug; struct blk_plug_cb *cb; if (!plug) return NULL; list_for_each_entry(cb, &plug->cb_list, list) if (cb->callback == unplug && cb->data == data) return cb; /* Not currently on the callback list */ BUG_ON(size < sizeof(*cb)); cb = kzalloc(size, GFP_ATOMIC); if (cb) { cb->data = data; cb->callback = unplug; list_add(&cb->list, &plug->cb_list); } return cb; } EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); } /** * blk_finish_plug - mark the end of a batch of submitted I/O * @plug: The &struct blk_plug passed to blk_start_plug() * * Description: * Indicate that a batch of I/O submissions is complete. This function * must be paired with an initial call to blk_start_plug(). The intent * is to allow the block layer to optimize I/O submission. See the * documentation for blk_start_plug() for more information. */ void blk_finish_plug(struct blk_plug *plug) { if (plug != current->plug) return; blk_flush_plug_list(plug, false); current->plug = NULL; } EXPORT_SYMBOL(blk_finish_plug); void blk_io_schedule(void) { /* Prevent hang_check timer from firing at us during very long I/O */ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; if (timeout) io_schedule_timeout(timeout); else io_schedule(); } EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct bio, bi_opf)); /* used for unplugging and affects IO latency/throughput - HIGHPRI */ kblockd_workqueue = alloc_workqueue("kblockd", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); blk_debugfs_root = debugfs_create_dir("block", NULL); return 0; }
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP module. * * Version: @(#)tcp.h 1.0.5 05/23/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _TCP_H #define _TCP_H #define FASTRETRANS_DEBUG 1 #include <linux/list.h> #include <linux/tcp.h> #include <linux/bug.h> #include <linux/slab.h> #include <linux/cache.h> #include <linux/percpu.h> #include <linux/skbuff.h> #include <linux/kref.h> #include <linux/ktime.h> #include <linux/indirect_call_wrapper.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <net/inet_hashtables.h> #include <net/checksum.h> #include <net/request_sock.h> #include <net/sock_reuseport.h> #include <net/sock.h> #include <net/snmp.h> #include <net/ip.h> #include <net/tcp_states.h> #include <net/inet_ecn.h> #include <net/dst.h> #include <net/mptcp.h> #include <linux/seq_file.h> #include <linux/memcontrol.h> #include <linux/bpf-cgroup.h> #include <linux/siphash.h> extern struct inet_hashinfo tcp_hashinfo; DECLARE_PER_CPU(unsigned int, tcp_orphan_count); int tcp_orphan_count_sum(void); void tcp_time_wait(struct sock *sk, int state, int timeo); #define MAX_TCP_HEADER L1_CACHE_ALIGN(128 + MAX_HEADER) #define MAX_TCP_OPTION_SPACE 40 #define TCP_MIN_SND_MSS 48 #define TCP_MIN_GSO_SIZE (TCP_MIN_SND_MSS - MAX_TCP_OPTION_SPACE) /* * Never offer a window over 32767 without using window scaling. Some * poor stacks do signed 16bit maths! */ #define MAX_TCP_WINDOW 32767U /* Minimal accepted MSS. It is (60+60+8) - (20+20). */ #define TCP_MIN_MSS 88U /* The initial MTU to use for probing */ #define TCP_BASE_MSS 1024 /* probing interval, default to 10 minutes as per RFC4821 */ #define TCP_PROBE_INTERVAL 600 /* Specify interval when tcp mtu probing will stop */ #define TCP_PROBE_THRESHOLD 8 /* After receiving this amount of duplicate ACKs fast retransmit starts. */ #define TCP_FASTRETRANS_THRESH 3 /* Maximal number of ACKs sent quickly to accelerate slow-start. */ #define TCP_MAX_QUICKACKS 16U /* Maximal number of window scale according to RFC1323 */ #define TCP_MAX_WSCALE 14U /* urg_data states */ #define TCP_URG_VALID 0x0100 #define TCP_URG_NOTYET 0x0200 #define TCP_URG_READ 0x0400 #define TCP_RETR1 3 /* * This is how many retries it does before it * tries to figure out if the gateway is * down. Minimal RFC value is 3; it corresponds * to ~3sec-8min depending on RTO. */ #define TCP_RETR2 15 /* * This should take at least * 90 minutes to time out. * RFC1122 says that the limit is 100 sec. * 15 is ~13-30min depending on RTO. */ #define TCP_SYN_RETRIES 6 /* This is how many retries are done * when active opening a connection. * RFC1122 says the minimum retry MUST * be at least 180secs. Nevertheless * this value is corresponding to * 63secs of retransmission with the * current initial RTO. */ #define TCP_SYNACK_RETRIES 5 /* This is how may retries are done * when passive opening a connection. * This is corresponding to 31secs of * retransmission with the current * initial RTO. */ #define TCP_TIMEWAIT_LEN (60*HZ) /* how long to wait to destroy TIME-WAIT * state, about 60 seconds */ #define TCP_FIN_TIMEOUT TCP_TIMEWAIT_LEN /* BSD style FIN_WAIT2 deadlock breaker. * It used to be 3min, new value is 60sec, * to combine FIN-WAIT-2 timeout with * TIME-WAIT timer. */ #define TCP_FIN_TIMEOUT_MAX (120 * HZ) /* max TCP_LINGER2 value (two minutes) */ #define TCP_DELACK_MAX ((unsigned)(HZ/5)) /* maximal time to delay before sending an ACK */ #if HZ >= 100 #define TCP_DELACK_MIN ((unsigned)(HZ/25)) /* minimal time to delay before sending an ACK */ #define TCP_ATO_MIN ((unsigned)(HZ/25)) #else #define TCP_DELACK_MIN 4U #define TCP_ATO_MIN 4U #endif #define TCP_RTO_MAX ((unsigned)(120*HZ)) #define TCP_RTO_MIN ((unsigned)(HZ/5)) #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now * used as a fallback RTO for the * initial data transmission if no * valid RTT sample has been acquired, * most likely due to retrans in 3WHS. */ #define TCP_RESOURCE_PROBE_INTERVAL ((unsigned)(HZ/2U)) /* Maximal interval between probes * for local resources. */ #define TCP_KEEPALIVE_TIME (120*60*HZ) /* two hours */ #define TCP_KEEPALIVE_PROBES 9 /* Max of 9 keepalive probes */ #define TCP_KEEPALIVE_INTVL (75*HZ) #define MAX_TCP_KEEPIDLE 32767 #define MAX_TCP_KEEPINTVL 32767 #define MAX_TCP_KEEPCNT 127 #define MAX_TCP_SYNCNT 127 #define TCP_SYNQ_INTERVAL (HZ/5) /* Period of SYNACK timer */ #define TCP_PAWS_24DAYS (60 * 60 * 24 * 24) #define TCP_PAWS_MSL 60 /* Per-host timestamps are invalidated * after this time. It should be equal * (or greater than) TCP_TIMEWAIT_LEN * to provide reliability equal to one * provided by timewait state. */ #define TCP_PAWS_WINDOW 1 /* Replay window for per-host * timestamps. It must be less than * minimal timewait lifetime. */ /* * TCP option */ #define TCPOPT_NOP 1 /* Padding */ #define TCPOPT_EOL 0 /* End of options */ #define TCPOPT_MSS 2 /* Segment size negotiating */ #define TCPOPT_WINDOW 3 /* Window scaling */ #define TCPOPT_SACK_PERM 4 /* SACK Permitted */ #define TCPOPT_SACK 5 /* SACK Block */ #define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ #define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ #define TCPOPT_MPTCP 30 /* Multipath TCP (RFC6824) */ #define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ #define TCPOPT_EXP 254 /* Experimental */ /* Magic number to be after the option value for sharing TCP * experimental options. See draft-ietf-tcpm-experimental-options-00.txt */ #define TCPOPT_FASTOPEN_MAGIC 0xF989 #define TCPOPT_SMC_MAGIC 0xE2D4C3D9 /* * TCP option lengths */ #define TCPOLEN_MSS 4 #define TCPOLEN_WINDOW 3 #define TCPOLEN_SACK_PERM 2 #define TCPOLEN_TIMESTAMP 10 #define TCPOLEN_MD5SIG 18 #define TCPOLEN_FASTOPEN_BASE 2 #define TCPOLEN_EXP_FASTOPEN_BASE 4 #define TCPOLEN_EXP_SMC_BASE 6 /* But this is what stacks really send out. */ #define TCPOLEN_TSTAMP_ALIGNED 12 #define TCPOLEN_WSCALE_ALIGNED 4 #define TCPOLEN_SACKPERM_ALIGNED 4 #define TCPOLEN_SACK_BASE 2 #define TCPOLEN_SACK_BASE_ALIGNED 4 #define TCPOLEN_SACK_PERBLOCK 8 #define TCPOLEN_MD5SIG_ALIGNED 20 #define TCPOLEN_MSS_ALIGNED 4 #define TCPOLEN_EXP_SMC_BASE_ALIGNED 8 /* Flags in tp->nonagle */ #define TCP_NAGLE_OFF 1 /* Nagle's algo is disabled */ #define TCP_NAGLE_CORK 2 /* Socket is corked */ #define TCP_NAGLE_PUSH 4 /* Cork is overridden for already queued data */ /* TCP thin-stream limits */ #define TCP_THIN_LINEAR_RETRIES 6 /* After 6 linear retries, do exp. backoff */ /* TCP initial congestion window as per rfc6928 */ #define TCP_INIT_CWND 10 /* Bit Flags for sysctl_tcp_fastopen */ #define TFO_CLIENT_ENABLE 1 #define TFO_SERVER_ENABLE 2 #define TFO_CLIENT_NO_COOKIE 4 /* Data in SYN w/o cookie option */ /* Accept SYN data w/o any cookie option */ #define TFO_SERVER_COOKIE_NOT_REQD 0x200 /* Force enable TFO on all listeners, i.e., not requiring the * TCP_FASTOPEN socket option. */ #define TFO_SERVER_WO_SOCKOPT1 0x400 /* sysctl variables for tcp */ extern int sysctl_tcp_max_orphans; extern long sysctl_tcp_mem[3]; #define TCP_RACK_LOSS_DETECTION 0x1 /* Use RACK to detect losses */ #define TCP_RACK_STATIC_REO_WND 0x2 /* Use static RACK reo wnd */ #define TCP_RACK_NO_DUPTHRESH 0x4 /* Do not use DUPACK threshold in RACK */ extern atomic_long_t tcp_memory_allocated; extern struct percpu_counter tcp_sockets_allocated; extern unsigned long tcp_memory_pressure; /* optimized version of sk_under_memory_pressure() for TCP sockets */ static inline bool tcp_under_memory_pressure(const struct sock *sk) { if (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return READ_ONCE(tcp_memory_pressure); } /* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ static inline bool before(__u32 seq1, __u32 seq2) { return (__s32)(seq1-seq2) < 0; } #define after(seq2, seq1) before(seq1, seq2) /* is s2<=s1<=s3 ? */ static inline bool between(__u32 seq1, __u32 seq2, __u32 seq3) { return seq3 - seq2 >= seq1 - seq2; } static inline bool tcp_out_of_memory(struct sock *sk) { if (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && sk_memory_allocated(sk) > sk_prot_mem_limits(sk, 2)) return true; return false; } void sk_forced_mem_schedule(struct sock *sk, int size); bool tcp_check_oom(struct sock *sk, int shift); extern struct proto tcp_prot; #define TCP_INC_STATS(net, field) SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define __TCP_INC_STATS(net, field) __SNMP_INC_STATS((net)->mib.tcp_statistics, field) #define TCP_DEC_STATS(net, field) SNMP_DEC_STATS((net)->mib.tcp_statistics, field) #define TCP_ADD_STATS(net, field, val) SNMP_ADD_STATS((net)->mib.tcp_statistics, field, val) void tcp_tasklet_init(void); int tcp_v4_err(struct sk_buff *skb, u32); void tcp_shutdown(struct sock *sk, int how); int tcp_v4_early_demux(struct sk_buff *skb); int tcp_v4_rcv(struct sk_buff *skb); int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw); int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size); int tcp_sendpage(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset, size_t size, int flags); int tcp_send_mss(struct sock *sk, int *size_goal, int flags); void tcp_push(struct sock *sk, int flags, int mss_now, int nonagle, int size_goal); void tcp_release_cb(struct sock *sk); void tcp_wfree(struct sk_buff *skb); void tcp_write_timer_handler(struct sock *sk); void tcp_delack_timer_handler(struct sock *sk); int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg); int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb); void tcp_rcv_established(struct sock *sk, struct sk_buff *skb); void tcp_rcv_space_adjust(struct sock *sk); int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp); void tcp_twsk_destructor(struct sock *sk); ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); void tcp_enter_quickack_mode(struct sock *sk, unsigned int max_quickacks); static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ack.quick) { if (pkts >= icsk->icsk_ack.quick) { icsk->icsk_ack.quick = 0; /* Leaving quickack mode we deflate ATO. */ icsk->icsk_ack.ato = TCP_ATO_MIN; } else icsk->icsk_ack.quick -= pkts; } } #define TCP_ECN_OK 1 #define TCP_ECN_QUEUE_CWR 2 #define TCP_ECN_DEMAND_CWR 4 #define TCP_ECN_SEEN 8 enum tcp_tw_status { TCP_TW_SUCCESS = 0, TCP_TW_RST = 1, TCP_TW_ACK = 2, TCP_TW_SYN = 3 }; enum tcp_tw_status tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb, const struct tcphdr *th); struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, struct request_sock *req, bool fastopen, bool *lost_race); int tcp_child_process(struct sock *parent, struct sock *child, struct sk_buff *skb); void tcp_enter_loss(struct sock *sk); void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked, int flag); void tcp_clear_retrans(struct tcp_sock *tp); void tcp_update_metrics(struct sock *sk); void tcp_init_metrics(struct sock *sk); void tcp_metrics_init(void); bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); void tcp_close(struct sock *sk, long timeout); void tcp_init_sock(struct sock *sk); void tcp_init_transfer(struct sock *sk, int bpf_op, struct sk_buff *skb); __poll_t tcp_poll(struct file *file, struct socket *sock, struct poll_table_struct *wait); int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); void tcp_set_keepalive(struct sock *sk, int val); void tcp_syn_ack_timeout(const struct request_sock *req); int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); int tcp_set_rcvlowat(struct sock *sk, int val); void tcp_data_ready(struct sock *sk); #ifdef CONFIG_MMU int tcp_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); #endif void tcp_parse_options(const struct net *net, const struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab, struct tcp_fastopen_cookie *foc); const u8 *tcp_parse_md5sig_option(const struct tcphdr *th); /* * BPF SKB-less helpers */ u16 tcp_v4_get_syncookie(struct sock *sk, struct iphdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_v6_get_syncookie(struct sock *sk, struct ipv6hdr *iph, struct tcphdr *th, u32 *cookie); u16 tcp_get_syncookie_mss(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct tcphdr *th); /* * TCP v4 functions exported for the inet6 API */ void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb); void tcp_v4_mtu_reduced(struct sock *sk); void tcp_req_err(struct sock *sk, u32 seq, bool abort); void tcp_ld_RTO_revert(struct sock *sk, u32 seq); int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb); struct sock *tcp_create_openreq_child(const struct sock *sk, struct request_sock *req, struct sk_buff *skb); void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst); struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, struct request_sock *req_unhash, bool *own_req); int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb); int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); int tcp_connect(struct sock *sk); enum tcp_synack_type { TCP_SYNACK_NORMAL, TCP_SYNACK_FASTOPEN, TCP_SYNACK_COOKIE, }; struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); int tcp_disconnect(struct sock *sk, int flags); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size); void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb); /* From syncookies.c */ struct sock *tcp_get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst, u32 tsoff); int __cookie_v4_check(const struct iphdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb); struct request_sock *cookie_tcp_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES /* Syncookies use a monotonic timer which increments every 60 seconds. * This counter is used both as a hash input and partially encoded into * the cookie value. A cookie is only validated further if the delta * between the current counter value and the encoded one is less than this, * i.e. a sent cookie is valid only at most for 2*60 seconds (or less if * the counter advances immediately after a cookie is generated). */ #define MAX_SYNCOOKIE_AGE 2 #define TCP_SYNCOOKIE_PERIOD (60 * HZ) #define TCP_SYNCOOKIE_VALID (MAX_SYNCOOKIE_AGE * TCP_SYNCOOKIE_PERIOD) /* syncookies: remember time of last synqueue overflow * But do not dirty this field too often (once per second is enough) * It is racy as we do not hold a lock, but race is very minor. */ static inline void tcp_synq_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(reuse->synq_overflow_ts, now); return; } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); if (!time_between32(now, last_overflow, last_overflow + HZ)) WRITE_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp, now); } /* syncookies: no recent synqueue overflow on this listening socket? */ static inline bool tcp_synq_no_recent_overflow(const struct sock *sk) { unsigned int last_overflow; unsigned int now = jiffies; if (sk->sk_reuseport) { struct sock_reuseport *reuse; reuse = rcu_dereference(sk->sk_reuseport_cb); if (likely(reuse)) { last_overflow = READ_ONCE(reuse->synq_overflow_ts); return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } } last_overflow = READ_ONCE(tcp_sk(sk)->rx_opt.ts_recent_stamp); /* If last_overflow <= jiffies <= last_overflow + TCP_SYNCOOKIE_VALID, * then we're under synflood. However, we have to use * 'last_overflow - HZ' as lower bound. That's because a concurrent * tcp_synq_overflow() could update .ts_recent_stamp after we read * jiffies but before we store .ts_recent_stamp into last_overflow, * which could lead to rejecting a valid syncookie. */ return !time_between32(now, last_overflow - HZ, last_overflow + TCP_SYNCOOKIE_VALID); } static inline u32 tcp_cookie_time(void) { u64 val = get_jiffies_64(); do_div(val, TCP_SYNCOOKIE_PERIOD); return val; } u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v4_init_sequence(const struct sk_buff *skb, __u16 *mss); u64 cookie_init_timestamp(struct request_sock *req, u64 now); bool cookie_timestamp_decode(const struct net *net, struct tcp_options_received *opt); bool cookie_ecn_ok(const struct tcp_options_received *opt, const struct net *net, const struct dst_entry *dst); /* From net/ipv6/syncookies.c */ int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, u32 cookie); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, const struct tcphdr *th, u16 *mssp); __u32 cookie_v6_init_sequence(const struct sk_buff *skb, __u16 *mss); #endif /* tcp_output.c */ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, int nonagle); int __tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs); void tcp_retransmit_timer(struct sock *sk); void tcp_xmit_retransmit_queue(struct sock *); void tcp_simple_retransmit(struct sock *); void tcp_enter_recovery(struct sock *sk, bool ece_ack); int tcp_trim_head(struct sock *, struct sk_buff *, u32); enum tcp_queue { TCP_FRAG_IN_WRITE_QUEUE, TCP_FRAG_IN_RTX_QUEUE, }; int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, struct sk_buff *skb, u32 len, unsigned int mss_now, gfp_t gfp); void tcp_send_probe0(struct sock *); void tcp_send_partial(struct sock *); int tcp_write_wakeup(struct sock *, int mib); void tcp_send_fin(struct sock *sk); void tcp_send_active_reset(struct sock *sk, gfp_t priority); int tcp_send_synack(struct sock *); void tcp_push_one(struct sock *, unsigned int mss_now); void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); void tcp_send_ack(struct sock *sk); void tcp_send_delayed_ack(struct sock *sk); void tcp_send_loss_probe(struct sock *sk); bool tcp_schedule_loss_probe(struct sock *sk, bool advancing_rto); void tcp_skb_collapse_tstamp(struct sk_buff *skb, const struct sk_buff *next_skb); /* tcp_input.c */ void tcp_rearm_rto(struct sock *sk); void tcp_synack_rtt_meas(struct sock *sk, struct request_sock *req); void tcp_reset(struct sock *sk); void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, struct sk_buff *skb); void tcp_fin(struct sock *sk); /* tcp_timer.c */ void tcp_init_xmit_timers(struct sock *); static inline void tcp_clear_xmit_timers(struct sock *sk) { if (hrtimer_try_to_cancel(&tcp_sk(sk)->pacing_timer) == 1) __sock_put(sk); if (hrtimer_try_to_cancel(&tcp_sk(sk)->compressed_ack_timer) == 1) __sock_put(sk); inet_csk_clear_xmit_timers(sk); } unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu); unsigned int tcp_current_mss(struct sock *sk); u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when); /* Bound MSS / TSO packet size with the half of the window */ static inline int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) { int cutoff; /* When peer uses tiny windows, there is no use in packetizing * to sub-MSS pieces for the sake of SWS or making sure there * are enough packets in the pipe for fast recovery. * * On the other hand, for extremely large MSS devices, handling * smaller than MSS windows in this way does make sense. */ if (tp->max_window > TCP_MSS_DEFAULT) cutoff = (tp->max_window >> 1); else cutoff = tp->max_window; if (cutoff && pktsize > cutoff) return max_t(int, cutoff, 68U - tp->tcp_header_len); else return pktsize; } /* tcp.c */ void tcp_get_info(struct sock *, struct tcp_info *); /* Read 'sendfile()'-style from a TCP socket */ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); void tcp_initialize_rcv_mss(struct sock *sk); int tcp_mtu_to_mss(struct sock *sk, int pmtu); int tcp_mss_to_mtu(struct sock *sk, int mss); void tcp_mtup_init(struct sock *sk); static inline void tcp_bound_rto(const struct sock *sk) { if (inet_csk(sk)->icsk_rto > TCP_RTO_MAX) inet_csk(sk)->icsk_rto = TCP_RTO_MAX; } static inline u32 __tcp_set_rto(const struct tcp_sock *tp) { return usecs_to_jiffies((tp->srtt_us >> 3) + tp->rttvar_us); } static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { /* mptcp hooks are only on the slow path */ if (sk_is_mptcp((struct sock *)tp)) return; tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } static inline void tcp_fast_path_check(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); if (RB_EMPTY_ROOT(&tp->out_of_order_queue) && tp->rcv_wnd && atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf && !tp->urg_data) tcp_fast_path_on(tp); } /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(struct sock *sk) { const struct dst_entry *dst = __sk_dst_get(sk); u32 rto_min = inet_csk(sk)->icsk_rto_min; if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) rto_min = dst_metric_rtt(dst, RTAX_RTO_MIN); return rto_min; } static inline u32 tcp_rto_min_us(struct sock *sk) { return jiffies_to_usecs(tcp_rto_min(sk)); } static inline bool tcp_ca_dst_locked(const struct dst_entry *dst) { return dst_metric_locked(dst, RTAX_CC_ALGO); } /* Minimum RTT in usec. ~0 means not available. */ static inline u32 tcp_min_rtt(const struct tcp_sock *tp) { return minmax_get(&tp->rtt_min); } /* Compute the actual receive window we are currently advertising. * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; if (win < 0) win = 0; return (u32) win; } /* Choose a new window, without checks for shrinking, and without * scaling applied to the result. The caller does these things * if necessary. This is a "raw" window selection. */ u32 __tcp_select_window(struct sock *sk); void tcp_send_window_probe(struct sock *sk); /* TCP uses 32bit jiffies to save some space. * Note that this is different from tcp_time_stamp, which * historically has been the same until linux-4.13. */ #define tcp_jiffies32 ((u32)jiffies) /* * Deliver a 32bit value for TCP timestamp option (RFC 7323) * It is no longer tied to jiffies, but to 1 ms clock. * Note: double check if you want to use tcp_jiffies32 instead of this. */ #define TCP_TS_HZ 1000 static inline u64 tcp_clock_ns(void) { return ktime_get_ns(); } static inline u64 tcp_clock_us(void) { return div_u64(tcp_clock_ns(), NSEC_PER_USEC); } /* This should only be used in contexts where tp->tcp_mstamp is up to date */ static inline u32 tcp_time_stamp(const struct tcp_sock *tp) { return div_u64(tp->tcp_mstamp, USEC_PER_SEC / TCP_TS_HZ); } /* Convert a nsec timestamp into TCP TSval timestamp (ms based currently) */ static inline u32 tcp_ns_to_ts(u64 ns) { return div_u64(ns, NSEC_PER_SEC / TCP_TS_HZ); } /* Could use tcp_clock_us() / 1000, but this version uses a single divide */ static inline u32 tcp_time_stamp_raw(void) { return tcp_ns_to_ts(tcp_clock_ns()); } void tcp_mstamp_refresh(struct tcp_sock *tp); static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) { return max_t(s64, t1 - t0, 0); } static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) { return tcp_ns_to_ts(skb->skb_mstamp_ns); } /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { return div_u64(skb->skb_mstamp_ns, NSEC_PER_USEC); } #define tcp_flag_byte(th) (((u_int8_t *)th)[13]) #define TCPHDR_FIN 0x01 #define TCPHDR_SYN 0x02 #define TCPHDR_RST 0x04 #define TCPHDR_PSH 0x08 #define TCPHDR_ACK 0x10 #define TCPHDR_URG 0x20 #define TCPHDR_ECE 0x40 #define TCPHDR_CWR 0x80 #define TCPHDR_SYN_ECN (TCPHDR_SYN | TCPHDR_ECE | TCPHDR_CWR) /* This is what the send packet queuing engine uses to pass * TCP per-packet control information to the transmission code. * We also store the host-order sequence numbers in here too. * This is 44 bytes if IPV6 is enabled. * If this grows please adjust skbuff.h:skbuff->cb[xxx] size appropriately. */ struct tcp_skb_cb { __u32 seq; /* Starting sequence number */ __u32 end_seq; /* SEQ + FIN + SYN + datalen */ union { /* Note : tcp_tw_isn is used in input path only * (isn chosen by tcp_timewait_state_process()) * * tcp_gso_segs/size are used in write queue only, * cf tcp_skb_pcount()/tcp_skb_mss() */ __u32 tcp_tw_isn; struct { u16 tcp_gso_segs; u16 tcp_gso_size; }; }; __u8 tcp_flags; /* TCP header flags. (tcp[13]) */ __u8 sacked; /* State flags for SACK. */ #define TCPCB_SACKED_ACKED 0x01 /* SKB ACK'd by a SACK block */ #define TCPCB_SACKED_RETRANS 0x02 /* SKB retransmitted */ #define TCPCB_LOST 0x04 /* SKB is lost */ #define TCPCB_TAGBITS 0x07 /* All tag bits */ #define TCPCB_REPAIRED 0x10 /* SKB repaired (no skb_mstamp_ns) */ #define TCPCB_EVER_RETRANS 0x80 /* Ever retransmitted frame */ #define TCPCB_RETRANS (TCPCB_SACKED_RETRANS|TCPCB_EVER_RETRANS| \ TCPCB_REPAIRED) __u8 ip_dsfield; /* IPv4 tos or IPv6 dsfield */ __u8 txstamp_ack:1, /* Record TX timestamp for ack? */ eor:1, /* Is skb MSG_EOR marked? */ has_rxtstamp:1, /* SKB has a RX timestamp */ unused:5; __u32 ack_seq; /* Sequence number ACK'd */ union { struct { /* There is space for up to 24 bytes */ __u32 in_flight:30,/* Bytes in flight at transmit */ is_app_limited:1, /* cwnd not fully used? */ unused:1; /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ u64 first_tx_mstamp; /* when we reached the "delivered" count */ u64 delivered_mstamp; } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; #if IS_ENABLED(CONFIG_IPV6) struct inet6_skb_parm h6; #endif } header; /* For incoming skbs */ struct { __u32 flags; struct sock *sk_redir; void *data_end; } bpf; }; }; #define TCP_SKB_CB(__skb) ((struct tcp_skb_cb *)&((__skb)->cb[0])) static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb) { TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb); } static inline bool tcp_skb_bpf_ingress(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.flags & BPF_F_INGRESS; } static inline struct sock *tcp_skb_bpf_redirect_fetch(struct sk_buff *skb) { return TCP_SKB_CB(skb)->bpf.sk_redir; } static inline void tcp_skb_bpf_redirect_clear(struct sk_buff *skb) { TCP_SKB_CB(skb)->bpf.sk_redir = NULL; } extern const struct inet_connection_sock_af_ops ipv4_specific; #if IS_ENABLED(CONFIG_IPV6) /* This is the variant of inet6_iif() that must be used by TCP, * as TCP moves IP6CB into a different location in skb->cb[] */ static inline int tcp_v6_iif(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->header.h6.iif; } static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) { bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; } /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v6_sdif(const struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags)) return TCP_SKB_CB(skb)->header.h6.iif; #endif return 0; } extern const struct inet_connection_sock_af_ops ipv6_specific; INDIRECT_CALLABLE_DECLARE(void tcp_v6_send_check(struct sock *sk, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp_v6_rcv(struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(void tcp_v6_early_demux(struct sk_buff *skb)); #endif /* TCP_SKB_CB reference means this can not be used from early demux */ static inline int tcp_v4_sdif(struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return TCP_SKB_CB(skb)->header.h4.iif; #endif return 0; } /* Due to TSO, an SKB can be composed of multiple actual * packets. To keep these tracked properly, we use this. */ static inline int tcp_skb_pcount(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_segs; } static inline void tcp_skb_pcount_set(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs = segs; } static inline void tcp_skb_pcount_add(struct sk_buff *skb, int segs) { TCP_SKB_CB(skb)->tcp_gso_segs += segs; } /* This is valid iff skb is in write queue and tcp_skb_pcount() > 1. */ static inline int tcp_skb_mss(const struct sk_buff *skb) { return TCP_SKB_CB(skb)->tcp_gso_size; } static inline bool tcp_skb_can_collapse_to(const struct sk_buff *skb) { return likely(!TCP_SKB_CB(skb)->eor); } static inline bool tcp_skb_can_collapse(const struct sk_buff *to, const struct sk_buff *from) { return likely(tcp_skb_can_collapse_to(to) && mptcp_skb_can_collapse(to, from)); } /* Events passed to congestion control interface */ enum tcp_ca_event { CA_EVENT_TX_START, /* first transmit when no packets in flight */ CA_EVENT_CWND_RESTART, /* congestion window restart */ CA_EVENT_COMPLETE_CWR, /* end of congestion recovery */ CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ enum tcp_ca_ack_event_flags { CA_ACK_SLOWPATH = (1 << 0), /* In slow path processing */ CA_ACK_WIN_UPDATE = (1 << 1), /* ACK updated window */ CA_ACK_ECE = (1 << 2), /* ECE bit is set on ack */ }; /* * Interface for adding new TCP congestion control handlers */ #define TCP_CA_NAME_MAX 16 #define TCP_CA_MAX 128 #define TCP_CA_BUF_MAX (TCP_CA_NAME_MAX*TCP_CA_MAX) #define TCP_CA_UNSPEC 0 /* Algorithm can be set on socket without CAP_NET_ADMIN privileges */ #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) union tcp_cc_info; struct ack_sample { u32 pkts_acked; s32 rtt_us; u32 in_flight; }; /* A rate sample measures the number of (original/retransmitted) data * packets delivered "delivered" over an interval of time "interval_us". * The tcp_rate.c code fills in the rate sample, and congestion * control modules that define a cong_control function to run at the end * of ACK processing can optionally chose to consult this sample when * setting cwnd and pacing rate. * A sample is invalid if "delivered" or "interval_us" is negative. */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ s32 delivered; /* number of packets delivered over interval */ long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ long rtt_us; /* RTT of last (S)ACKed packet (or -1) */ int losses; /* number of packets marked lost upon ACK */ u32 acked_sacked; /* number of packets newly (S)ACKed upon ACK */ u32 prior_in_flight; /* in flight before this ACK */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? */ bool is_ack_delayed; /* is this (likely) a delayed ACK? */ }; struct tcp_congestion_ops { struct list_head list; u32 key; u32 flags; /* initialize private data (optional) */ void (*init)(struct sock *sk); /* cleanup private data (optional) */ void (*release)(struct sock *sk); /* return slow start threshold (required) */ u32 (*ssthresh)(struct sock *sk); /* do new cwnd calculation (required) */ void (*cong_avoid)(struct sock *sk, u32 ack, u32 acked); /* call before changing ca_state (optional) */ void (*set_state)(struct sock *sk, u8 new_state); /* call when cwnd event occurs (optional) */ void (*cwnd_event)(struct sock *sk, enum tcp_ca_event ev); /* call when ack arrives (optional) */ void (*in_ack_event)(struct sock *sk, u32 flags); /* new value of cwnd after loss (required) */ u32 (*undo_cwnd)(struct sock *sk); /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); /* override sysctl_tcp_min_tso_segs */ u32 (*min_tso_segs)(struct sock *sk); /* returns the multiplier used in tcp_sndbuf_expand (optional) */ u32 (*sndbuf_expand)(struct sock *sk); /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) */ void (*cong_control)(struct sock *sk, const struct rate_sample *rs); /* get info for inet_diag (optional) */ size_t (*get_info)(struct sock *sk, u32 ext, int *attr, union tcp_cc_info *info); char name[TCP_CA_NAME_MAX]; struct module *owner; }; int tcp_register_congestion_control(struct tcp_congestion_ops *type); void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); void tcp_assign_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk); int tcp_set_default_congestion_control(struct net *net, const char *name); void tcp_get_default_congestion_control(struct net *net, char *name); void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len); int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool cap_net_admin); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); u32 tcp_reno_ssthresh(struct sock *sk); u32 tcp_reno_undo_cwnd(struct sock *sk); void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); extern struct tcp_congestion_ops tcp_reno; struct tcp_congestion_ops *tcp_ca_find(const char *name); struct tcp_congestion_ops *tcp_ca_find_key(u32 key); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca); #ifdef CONFIG_INET char *tcp_ca_get_name_by_key(u32 key, char *buffer); #else static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) { return NULL; } #endif static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); return icsk->icsk_ca_ops->flags & TCP_CONG_NEEDS_ECN; } static inline void tcp_set_ca_state(struct sock *sk, const u8 ca_state) { struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->set_state) icsk->icsk_ca_ops->set_state(sk, ca_state); icsk->icsk_ca_state = ca_state; } static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) { const struct inet_connection_sock *icsk = inet_csk(sk); if (icsk->icsk_ca_ops->cwnd_event) icsk->icsk_ca_ops->cwnd_event(sk, event); } /* From tcp_rate.c */ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, bool is_sack_reneg, struct rate_sample *rs); void tcp_rate_check_app_limited(struct sock *sk); /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. * * tcp_is_sack - SACK enabled * tcp_is_reno - No SACK */ static inline int tcp_is_sack(const struct tcp_sock *tp) { return likely(tp->rx_opt.sack_ok); } static inline bool tcp_is_reno(const struct tcp_sock *tp) { return !tcp_is_sack(tp); } static inline unsigned int tcp_left_out(const struct tcp_sock *tp) { return tp->sacked_out + tp->lost_out; } /* This determines how many packets are "in the network" to the best * of our knowledge. In many cases it is conservative, but where * detailed information is available from the receiver (via SACK * blocks etc.) we can make more aggressive calculations. * * Use this for decisions involving congestion control, use just * tp->packets_out to determine if the send queue is empty or not. * * Read this equation as: * * "Packets sent once on transmission queue" MINUS * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return tp->packets_out - tcp_left_out(tp) + tp->retrans_out; } #define TCP_INFINITE_SSTHRESH 0x7fffffff static inline bool tcp_in_slow_start(const struct tcp_sock *tp) { return tp->snd_cwnd < tp->snd_ssthresh; } static inline bool tcp_in_initial_slowstart(const struct tcp_sock *tp) { return tp->snd_ssthresh >= TCP_INFINITE_SSTHRESH; } static inline bool tcp_in_cwnd_reduction(const struct sock *sk) { return (TCPF_CA_CWR | TCPF_CA_Recovery) & (1 << inet_csk(sk)->icsk_ca_state); } /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is cwnd reduction phase, when cwnd is decreasing towards * ssthresh. */ static inline __u32 tcp_current_ssthresh(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); if (tcp_in_cwnd_reduction(sk)) return tp->snd_ssthresh; else return max(tp->snd_ssthresh, ((tp->snd_cwnd >> 1) + (tp->snd_cwnd >> 2))); } /* Use define here intentionally to get WARN_ON location shown at the caller */ #define tcp_verify_left_out(tp) WARN_ON(tcp_left_out(tp) > tp->packets_out) void tcp_enter_cwr(struct sock *sk); __u32 tcp_init_cwnd(const struct tcp_sock *tp, const struct dst_entry *dst); /* The maximum number of MSS of available cwnd for which TSO defers * sending if not using sysctl_tcp_tso_win_divisor. */ static inline __u32 tcp_max_tso_deferred_mss(const struct tcp_sock *tp) { return 3; } /* Returns end sequence number of the receiver's advertised window */ static inline u32 tcp_wnd_end(const struct tcp_sock *tp) { return tp->snd_una + tp->snd_wnd; } /* We follow the spirit of RFC2861 to validate cwnd but implement a more * flexible approach. The RFC suggests cwnd should not be raised unless * it was fully used previously. And that's exactly what we do in * congestion avoidance mode. But in slow start we allow cwnd to grow * as long as the application has used half the cwnd. * Example : * cwnd is 10 (IW10), but application sends 9 frames. * We allow cwnd to reach 18 when all frames are ACKed. * This check is safe because it's as aggressive as slow start which already * risks 100% overshoot. The advantage is that we discourage application to * either send more filler packets or data to artificially blow up the cwnd * usage, and allow application-limited process to probe bw more aggressively. */ static inline bool tcp_is_cwnd_limited(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); /* If in slow start, ensure cwnd grows to twice what was ACKed. */ if (tcp_in_slow_start(tp)) return tp->snd_cwnd < 2 * tp->max_packets_out; return tp->is_cwnd_limited; } /* BBR congestion control needs pacing. * Same remark for SO_MAX_PACING_RATE. * sch_fq packet scheduler is efficiently handling pacing, * but is not always installed/used. * Return true if TCP stack should pace packets itself. */ static inline bool tcp_needs_internal_pacing(const struct sock *sk) { return smp_load_acquire(&sk->sk_pacing_status) == SK_PACING_NEEDED; } /* Estimates in how many jiffies next packet for this flow can be sent. * Scheduling a retransmit timer too early would be silly. */ static inline unsigned long tcp_pacing_delay(const struct sock *sk) { s64 delay = tcp_sk(sk)->tcp_wstamp_ns - tcp_sk(sk)->tcp_clock_cache; return delay > 0 ? nsecs_to_jiffies(delay) : 0; } static inline void tcp_reset_xmit_timer(struct sock *sk, const int what, unsigned long when, const unsigned long max_when) { inet_csk_reset_xmit_timer(sk, what, when + tcp_pacing_delay(sk), max_when); } /* Something is really bad, we could not queue an additional packet, * because qdisc is full or receiver sent a 0 window, or we are paced. * We do not want to add fuel to the fire, or abort too early, * so make sure the timer we arm now is at least 200ms in the future, * regardless of current icsk_rto value (as it could be ~2ms) */ static inline unsigned long tcp_probe0_base(const struct sock *sk) { return max_t(unsigned long, inet_csk(sk)->icsk_rto, TCP_RTO_MIN); } /* Variant of inet_csk_rto_backoff() used for zero window probes */ static inline unsigned long tcp_probe0_when(const struct sock *sk, unsigned long max_when) { u64 when = (u64)tcp_probe0_base(sk) << inet_csk(sk)->icsk_backoff; return (unsigned long)min_t(u64, when, max_when); } static inline void tcp_check_probe_timer(struct sock *sk) { if (!tcp_sk(sk)->packets_out && !inet_csk(sk)->icsk_pending) tcp_reset_xmit_timer(sk, ICSK_TIME_PROBE0, tcp_probe0_base(sk), TCP_RTO_MAX); } static inline void tcp_init_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } static inline void tcp_update_wl(struct tcp_sock *tp, u32 seq) { tp->snd_wl1 = seq; } /* * Calculate(/check) TCP checksum */ static inline __sum16 tcp_v4_check(int len, __be32 saddr, __be32 daddr, __wsum base) { return csum_tcpudp_magic(saddr, daddr, len, IPPROTO_TCP, base); } static inline bool tcp_checksum_complete(struct sk_buff *skb) { return !skb_csum_unnecessary(skb) && __skb_checksum_complete(skb); } bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); int tcp_filter(struct sock *sk, struct sk_buff *skb); void tcp_set_state(struct sock *sk, int state); void tcp_done(struct sock *sk); int tcp_abort(struct sock *sk, int err); static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->num_sacks = 0; } void tcp_cwnd_restart(struct sock *sk, s32 delta); static inline void tcp_slow_start_after_idle_check(struct sock *sk) { const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); s32 delta; if (!sock_net(sk)->ipv4.sysctl_tcp_slow_start_after_idle || tp->packets_out || ca_ops->cong_control) return; delta = tcp_jiffies32 - tp->lsndtime; if (delta > inet_csk(sk)->icsk_rto) tcp_cwnd_restart(sk, delta); } /* Determine a window scaling and initial window to offer. */ void tcp_select_initial_window(const struct sock *sk, int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, int wscale_ok, __u8 *rcv_wscale, __u32 init_rcv_wnd); static inline int tcp_win_from_space(const struct sock *sk, int space) { int tcp_adv_win_scale = sock_net(sk)->ipv4.sysctl_tcp_adv_win_scale; return tcp_adv_win_scale <= 0 ? (space>>(-tcp_adv_win_scale)) : space - (space>>tcp_adv_win_scale); } /* Note: caller must be prepared to deal with negative returns */ static inline int tcp_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf) - READ_ONCE(sk->sk_backlog.len) - atomic_read(&sk->sk_rmem_alloc)); } static inline int tcp_full_space(const struct sock *sk) { return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); } void tcp_cleanup_rbuf(struct sock *sk, int copied); /* We provision sk_rcvbuf around 200% of sk_rcvlowat. * If 87.5 % (7/8) of the space has been consumed, we want to override * SO_RCVLOWAT constraint, since we are receiving skbs with too small * len/truesize ratio. */ static inline bool tcp_rmem_pressure(const struct sock *sk) { int rcvbuf, threshold; if (tcp_under_memory_pressure(sk)) return true; rcvbuf = READ_ONCE(sk->sk_rcvbuf); threshold = rcvbuf - (rcvbuf >> 3); return atomic_read(&sk->sk_rmem_alloc) > threshold; } extern void tcp_openreq_init_rwin(struct request_sock *req, const struct sock *sk_listener, const struct dst_entry *dst); void tcp_enter_memory_pressure(struct sock *sk); void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->keepalive_intvl ? : net->ipv4.sysctl_tcp_keepalive_intvl; } static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->keepalive_time ? : net->ipv4.sysctl_tcp_keepalive_time; } static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->keepalive_probes ? : net->ipv4.sysctl_tcp_keepalive_probes; } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) { const struct inet_connection_sock *icsk = &tp->inet_conn; return min_t(u32, tcp_jiffies32 - icsk->icsk_ack.lrcvtime, tcp_jiffies32 - tp->rcv_tstamp); } static inline int tcp_fin_time(const struct sock *sk) { int fin_timeout = tcp_sk(sk)->linger2 ? : sock_net(sk)->ipv4.sysctl_tcp_fin_timeout; const int rto = inet_csk(sk)->icsk_rto; if (fin_timeout < (rto << 2) - (rto >> 1)) fin_timeout = (rto << 2) - (rto >> 1); return fin_timeout; } static inline bool tcp_paws_check(const struct tcp_options_received *rx_opt, int paws_win) { if ((s32)(rx_opt->ts_recent - rx_opt->rcv_tsval) <= paws_win) return true; if (unlikely(!time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_24DAYS))) return true; /* * Some OSes send SYN and SYNACK messages with tsval=0 tsecr=0, * then following tcp messages have valid values. Ignore 0 value, * or else 'negative' tsval might forbid us to accept their packets. */ if (!rx_opt->ts_recent) return true; return false; } static inline bool tcp_paws_reject(const struct tcp_options_received *rx_opt, int rst) { if (tcp_paws_check(rx_opt, 0)) return false; /* RST segments are not recommended to carry timestamp, and, if they do, it is recommended to ignore PAWS because "their cleanup function should take precedence over timestamps." Certainly, it is mistake. It is necessary to understand the reasons of this constraint to relax it: if peer reboots, clock may go out-of-sync and half-open connections will not be reset. Actually, the problem would be not existing if all the implementations followed draft about maintaining clock via reboots. Linux-2.2 DOES NOT! However, we can relax time bounds for RST segments to MSL. */ if (rst && !time_before32(ktime_get_seconds(), rx_opt->ts_recent_stamp + TCP_PAWS_MSL)) return false; return true; } bool tcp_oow_rate_limited(struct net *net, const struct sk_buff *skb, int mib_idx, u32 *last_oow_ack_time); static inline void tcp_mib_init(struct net *net) { /* See RFC 2012 */ TCP_ADD_STATS(net, TCP_MIB_RTOALGORITHM, 1); TCP_ADD_STATS(net, TCP_MIB_RTOMIN, TCP_RTO_MIN*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_RTOMAX, TCP_RTO_MAX*1000/HZ); TCP_ADD_STATS(net, TCP_MIB_MAXCONN, -1); } /* from STCP */ static inline void tcp_clear_retrans_hints_partial(struct tcp_sock *tp) { tp->lost_skb_hint = NULL; } static inline void tcp_clear_all_retrans_hints(struct tcp_sock *tp) { tcp_clear_retrans_hints_partial(tp); tp->retransmit_skb_hint = NULL; } union tcp_md5_addr { struct in_addr a4; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr a6; #endif }; /* - key database */ struct tcp_md5sig_key { struct hlist_node node; u8 keylen; u8 family; /* AF_INET or AF_INET6 */ u8 prefixlen; union tcp_md5_addr addr; int l3index; /* set if key added with L3 scope */ u8 key[TCP_MD5SIG_MAXKEYLEN]; struct rcu_head rcu; }; /* - sock block */ struct tcp_md5sig_info { struct hlist_head head; struct rcu_head rcu; }; /* - pseudo header */ struct tcp4_pseudohdr { __be32 saddr; __be32 daddr; __u8 pad; __u8 protocol; __be16 len; }; struct tcp6_pseudohdr { struct in6_addr saddr; struct in6_addr daddr; __be32 len; __be32 protocol; /* including padding */ }; union tcp_md5sum_block { struct tcp4_pseudohdr ip4; #if IS_ENABLED(CONFIG_IPV6) struct tcp6_pseudohdr ip6; #endif }; /* - pool: digest algorithm, hash description and scratch buffer */ struct tcp_md5sig_pool { struct ahash_request *md5_req; void *scratch; }; /* - functions */ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, const struct sock *sk, const struct sk_buff *skb); int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index, const u8 *newkey, u8 newkeylen, gfp_t gfp); int tcp_md5_do_del(struct sock *sk, const union tcp_md5_addr *addr, int family, u8 prefixlen, int l3index); struct tcp_md5sig_key *tcp_v4_md5_lookup(const struct sock *sk, const struct sock *addr_sk); #ifdef CONFIG_TCP_MD5SIG #include <linux/jump_label.h> extern struct static_key_false tcp_md5_needed; struct tcp_md5sig_key *__tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family); static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { if (!static_branch_unlikely(&tcp_md5_needed)) return NULL; return __tcp_md5_do_lookup(sk, l3index, addr, family); } #define tcp_twsk_md5_key(twsk) ((twsk)->tw_md5_key) #else static inline struct tcp_md5sig_key * tcp_md5_do_lookup(const struct sock *sk, int l3index, const union tcp_md5_addr *addr, int family) { return NULL; } #define tcp_twsk_md5_key(twsk) NULL #endif bool tcp_alloc_md5sig_pool(void); struct tcp_md5sig_pool *tcp_get_md5sig_pool(void); static inline void tcp_put_md5sig_pool(void) { local_bh_enable(); } int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *, unsigned int header_len); int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *key); /* From tcp_fastopen.c */ void tcp_fastopen_cache_get(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); void tcp_fastopen_cache_set(struct sock *sk, u16 mss, struct tcp_fastopen_cookie *cookie, bool syn_lost, u16 try_exp); struct tcp_fastopen_request { /* Fast Open cookie. Size 0 means a cookie request */ struct tcp_fastopen_cookie cookie; struct msghdr *data; /* data in MSG_FASTOPEN */ size_t size; int copied; /* queued in tcp_connect() */ struct ubuf_info *uarg; }; void tcp_free_fastopen_req(struct tcp_sock *tp); void tcp_fastopen_destroy_cipher(struct sock *sk); void tcp_fastopen_ctx_destroy(struct net *net); int tcp_fastopen_reset_cipher(struct net *net, struct sock *sk, void *primary_key, void *backup_key); int tcp_fastopen_get_cipher(struct net *net, struct inet_connection_sock *icsk, u64 *key); void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb); struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct tcp_fastopen_cookie *foc, const struct dst_entry *dst); void tcp_fastopen_init_key_once(struct net *net); bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss, struct tcp_fastopen_cookie *cookie); bool tcp_fastopen_defer_connect(struct sock *sk, int *err); #define TCP_FASTOPEN_KEY_LENGTH sizeof(siphash_key_t) #define TCP_FASTOPEN_KEY_MAX 2 #define TCP_FASTOPEN_KEY_BUF_LENGTH \ (TCP_FASTOPEN_KEY_LENGTH * TCP_FASTOPEN_KEY_MAX) /* Fastopen key context */ struct tcp_fastopen_context { siphash_key_t key[TCP_FASTOPEN_KEY_MAX]; int num; struct rcu_head rcu; }; extern unsigned int sysctl_tcp_fastopen_blackhole_timeout; void tcp_fastopen_active_disable(struct sock *sk); bool tcp_fastopen_active_should_disable(struct sock *sk); void tcp_fastopen_active_disable_ofo_check(struct sock *sk); void tcp_fastopen_active_detect_blackhole(struct sock *sk, bool expired); /* Caller needs to wrap with rcu_read_(un)lock() */ static inline struct tcp_fastopen_context *tcp_fastopen_get_ctx(const struct sock *sk) { struct tcp_fastopen_context *ctx; ctx = rcu_dereference(inet_csk(sk)->icsk_accept_queue.fastopenq.ctx); if (!ctx) ctx = rcu_dereference(sock_net(sk)->ipv4.tcp_fastopen_ctx); return ctx; } static inline bool tcp_fastopen_cookie_match(const struct tcp_fastopen_cookie *foc, const struct tcp_fastopen_cookie *orig) { if (orig->len == TCP_FASTOPEN_COOKIE_SIZE && orig->len == foc->len && !memcmp(orig->val, foc->val, foc->len)) return true; return false; } static inline int tcp_fastopen_context_len(const struct tcp_fastopen_context *ctx) { return ctx->num; } /* Latencies incurred by various limits for a sender. They are * chronograph-like stats that are mutually exclusive. */ enum tcp_chrono { TCP_CHRONO_UNSPEC, TCP_CHRONO_BUSY, /* Actively sending data (non-empty write queue) */ TCP_CHRONO_RWND_LIMITED, /* Stalled by insufficient receive window */ TCP_CHRONO_SNDBUF_LIMITED, /* Stalled by insufficient send buffer */ __TCP_CHRONO_MAX, }; void tcp_chrono_start(struct sock *sk, const enum tcp_chrono type); void tcp_chrono_stop(struct sock *sk, const enum tcp_chrono type); /* This helper is needed, because skb->tcp_tsorted_anchor uses * the same memory storage than skb->destructor/_skb_refdst */ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb) { skb->destructor = NULL; skb->_skb_refdst = 0UL; } #define tcp_skb_tsorted_save(skb) { \ unsigned long _save = skb->_skb_refdst; \ skb->_skb_refdst = 0UL; #define tcp_skb_tsorted_restore(skb) \ skb->_skb_refdst = _save; \ } void tcp_write_queue_purge(struct sock *sk); static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk) { return skb_rb_first(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk) { return skb_rb_last(&sk->tcp_rtx_queue); } static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline struct sk_buff *tcp_write_queue_tail(const struct sock *sk) { return skb_peek_tail(&sk->sk_write_queue); } #define tcp_for_write_queue_from_safe(skb, tmp, sk) \ skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp) static inline struct sk_buff *tcp_send_head(const struct sock *sk) { return skb_peek(&sk->sk_write_queue); } static inline bool tcp_skb_is_last(const struct sock *sk, const struct sk_buff *skb) { return skb_queue_is_last(&sk->sk_write_queue, skb); } /** * tcp_write_queue_empty - test if any payload (or FIN) is available in write queue * @sk: socket * * Since the write queue can have a temporary empty skb in it, * we must not use "return skb_queue_empty(&sk->sk_write_queue)" */ static inline bool tcp_write_queue_empty(const struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); return tp->write_seq == tp->snd_nxt; } static inline bool tcp_rtx_queue_empty(const struct sock *sk) { return RB_EMPTY_ROOT(&sk->tcp_rtx_queue); } static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk) { return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk); } static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb) { __skb_queue_tail(&sk->sk_write_queue, skb); /* Queue it, remembering where we must start sending. */ if (sk->sk_write_queue.next == skb) tcp_chrono_start(sk, TCP_CHRONO_BUSY); } /* Insert new before skb on the write queue of sk. */ static inline void tcp_insert_write_queue_before(struct sk_buff *new, struct sk_buff *skb, struct sock *sk) { __skb_queue_before(&sk->sk_write_queue, skb, new); } static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); __skb_unlink(skb, &sk->sk_write_queue); } void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb); static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk) { tcp_skb_tsorted_anchor_cleanup(skb); rb_erase(&skb->rbnode, &sk->tcp_rtx_queue); } static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk) { list_del(&skb->tcp_tsorted_anchor); tcp_rtx_queue_unlink(skb, sk); sk_wmem_free_skb(sk, skb); } static inline void tcp_push_pending_frames(struct sock *sk) { if (tcp_send_head(sk)) { struct tcp_sock *tp = tcp_sk(sk); __tcp_push_pending_frames(sk, tcp_current_mss(sk), tp->nonagle); } } /* Start sequence of the skb just after the highest skb with SACKed * bit, valid only if sacked_out > 0 or when the caller has ensured * validity by itself. */ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp) { if (!tp->sacked_out) return tp->snd_una; if (tp->highest_sack == NULL) return tp->snd_nxt; return TCP_SKB_CB(tp->highest_sack)->seq; } static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb) { tcp_sk(sk)->highest_sack = skb_rb_next(skb); } static inline struct sk_buff *tcp_highest_sack(struct sock *sk) { return tcp_sk(sk)->highest_sack; } static inline void tcp_highest_sack_reset(struct sock *sk) { tcp_sk(sk)->highest_sack = tcp_rtx_queue_head(sk); } /* Called when old skb is about to be deleted and replaced by new skb */ static inline void tcp_highest_sack_replace(struct sock *sk, struct sk_buff *old, struct sk_buff *new) { if (old == tcp_highest_sack(sk)) tcp_sk(sk)->highest_sack = new; } /* This helper checks if socket has IP_TRANSPARENT set */ static inline bool inet_sk_transparent(const struct sock *sk) { switch (sk->sk_state) { case TCP_TIME_WAIT: return inet_twsk(sk)->tw_transparent; case TCP_NEW_SYN_RECV: return inet_rsk(inet_reqsk(sk))->no_srccheck; } return inet_sk(sk)->transparent; } /* Determines whether this is a thin stream (which may suffer from * increased latency). Used to trigger latency-reducing mechanisms. */ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) { return tp->packets_out < 4 && !tcp_in_initial_slowstart(tp); } /* /proc */ enum tcp_seq_states { TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_ESTABLISHED, }; void *tcp_seq_start(struct seq_file *seq, loff_t *pos); void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos); void tcp_seq_stop(struct seq_file *seq, void *v); struct tcp_seq_afinfo { sa_family_t family; }; struct tcp_iter_state { struct seq_net_private p; enum tcp_seq_states state; struct sock *syn_wait_sk; struct tcp_seq_afinfo *bpf_seq_afinfo; int bucket, offset, sbucket, num; loff_t last_pos; }; extern struct request_sock_ops tcp_request_sock_ops; extern struct request_sock_ops tcp6_request_sock_ops; void tcp_v4_destroy_sock(struct sock *sk); struct sk_buff *tcp_gso_segment(struct sk_buff *skb, netdev_features_t features); struct sk_buff *tcp_gro_receive(struct list_head *head, struct sk_buff *skb); INDIRECT_CALLABLE_DECLARE(int tcp4_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp4_gro_receive(struct list_head *head, struct sk_buff *skb)); INDIRECT_CALLABLE_DECLARE(int tcp6_gro_complete(struct sk_buff *skb, int thoff)); INDIRECT_CALLABLE_DECLARE(struct sk_buff *tcp6_gro_receive(struct list_head *head, struct sk_buff *skb)); int tcp_gro_complete(struct sk_buff *skb); void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); return tp->notsent_lowat ?: net->ipv4.sysctl_tcp_notsent_lowat; } /* @wake is one when sk_stream_write_space() calls us. * This sends EPOLLOUT only if notsent_bytes is half the limit. * This mimics the strategy used in sock_def_write_space(). */ static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) { const struct tcp_sock *tp = tcp_sk(sk); u32 notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt); return (notsent_bytes << wake) < tcp_notsent_lowat(tp); } #ifdef CONFIG_PROC_FS int tcp4_proc_init(void); void tcp4_proc_exit(void); #endif int tcp_rtx_synack(const struct sock *sk, struct request_sock *req); int tcp_conn_request(struct request_sock_ops *rsk_ops, const struct tcp_request_sock_ops *af_ops, struct sock *sk, struct sk_buff *skb); /* TCP af-specific functions */ struct tcp_sock_af_ops { #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*md5_lookup) (const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash)(char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); int (*md5_parse)(struct sock *sk, int optname, sockptr_t optval, int optlen); #endif }; struct tcp_request_sock_ops { u16 mss_clamp; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *(*req_md5_lookup)(const struct sock *sk, const struct sock *addr_sk); int (*calc_md5_hash) (char *location, const struct tcp_md5sig_key *md5, const struct sock *sk, const struct sk_buff *skb); #endif void (*init_req)(struct request_sock *req, const struct sock *sk_listener, struct sk_buff *skb); #ifdef CONFIG_SYN_COOKIES __u32 (*cookie_init_seq)(const struct sk_buff *skb, __u16 *mss); #endif struct dst_entry *(*route_req)(const struct sock *sk, struct flowi *fl, const struct request_sock *req); u32 (*init_seq)(const struct sk_buff *skb); u32 (*init_ts_off)(const struct net *net, const struct sk_buff *skb); int (*send_synack)(const struct sock *sk, struct dst_entry *dst, struct flowi *fl, struct request_sock *req, struct tcp_fastopen_cookie *foc, enum tcp_synack_type synack_type, struct sk_buff *syn_skb); }; extern const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops; #if IS_ENABLED(CONFIG_IPV6) extern const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops; #endif #ifdef CONFIG_SYN_COOKIES static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { tcp_synq_overflow(sk); __NET_INC_STATS(sock_net(sk), LINUX_MIB_SYNCOOKIESSENT); return ops->cookie_init_seq(skb, mss); } #else static inline __u32 cookie_init_sequence(const struct tcp_request_sock_ops *ops, const struct sock *sk, struct sk_buff *skb, __u16 *mss) { return 0; } #endif int tcpv4_offload_init(void); void tcp_v4_init(void); void tcp_init(void); /* tcp_recovery.c */ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb); void tcp_newreno_mark_lost(struct sock *sk, bool snd_una_advanced); extern s32 tcp_rack_skb_timeout(struct tcp_sock *tp, struct sk_buff *skb, u32 reo_wnd); extern bool tcp_rack_mark_lost(struct sock *sk); extern void tcp_rack_advance(struct tcp_sock *tp, u8 sacked, u32 end_seq, u64 xmit_time); extern void tcp_rack_reo_timeout(struct sock *sk); extern void tcp_rack_update_reo_wnd(struct sock *sk, struct rate_sample *rs); /* At how many usecs into the future should the RTO fire? */ static inline s64 tcp_rto_delta_us(const struct sock *sk) { const struct sk_buff *skb = tcp_rtx_queue_head(sk); u32 rto = inet_csk(sk)->icsk_rto; u64 rto_time_stamp_us = tcp_skb_timestamp_us(skb) + jiffies_to_usecs(rto); return rto_time_stamp_us - tcp_sk(sk)->tcp_mstamp; } /* * Save and compile IPv4 options, return a pointer to it */ static inline struct ip_options_rcu *tcp_v4_save_options(struct net *net, struct sk_buff *skb) { const struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options_rcu *dopt = NULL; if (opt->optlen) { int opt_size = sizeof(*dopt) + opt->optlen; dopt = kmalloc(opt_size, GFP_ATOMIC); if (dopt && __ip_options_echo(net, &dopt->opt, skb, opt)) { kfree(dopt); dopt = NULL; } } return dopt; } /* locally generated TCP pure ACKs have skb->truesize == 2 * (check tcp_send_ack() in net/ipv4/tcp_output.c ) * This is much faster than dissecting the packet to find out. * (Think of GRE encapsulations, IPv4, IPv6, ...) */ static inline bool skb_is_tcp_pure_ack(const struct sk_buff *skb) { return skb->truesize == 2; } static inline void skb_set_tcp_pure_ack(struct sk_buff *skb) { skb->truesize = 2; } static inline int tcp_inq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); int answ; if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { answ = 0; } else if (sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data || before(tp->urg_seq, tp->copied_seq) || !before(tp->urg_seq, tp->rcv_nxt)) { answ = tp->rcv_nxt - tp->copied_seq; /* Subtract 1, if FIN was received */ if (answ && sock_flag(sk, SOCK_DONE)) answ--; } else { answ = tp->urg_seq - tp->copied_seq; } return answ; } int tcp_peek_len(struct socket *sock); static inline void tcp_segs_in(struct tcp_sock *tp, const struct sk_buff *skb) { u16 segs_in; segs_in = max_t(u16, 1, skb_shinfo(skb)->gso_segs); tp->segs_in += segs_in; if (skb->len > tcp_hdrlen(skb)) tp->data_segs_in += segs_in; } /* * TCP listen path runs lockless. * We forced "struct sock" to be const qualified to make sure * we don't modify one of its field by mistake. * Here, we increment sk_drops which is an atomic_t, so we can safely * make sock writable again. */ static inline void tcp_listendrop(const struct sock *sk) { atomic_inc(&((struct sock *)sk)->sk_drops); __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS); } enum hrtimer_restart tcp_pace_kick(struct hrtimer *timer); /* * Interface for adding Upper Level Protocols over TCP */ #define TCP_ULP_NAME_MAX 16 #define TCP_ULP_MAX 128 #define TCP_ULP_BUF_MAX (TCP_ULP_NAME_MAX*TCP_ULP_MAX) struct tcp_ulp_ops { struct list_head list; /* initialize ulp */ int (*init)(struct sock *sk); /* update ulp */ void (*update)(struct sock *sk, struct proto *p, void (*write_space)(struct sock *sk)); /* cleanup ulp */ void (*release)(struct sock *sk); /