/* SPDX-License-Identifier: GPL-2.0 */
/*
 * Access vector cache interface for object managers.
 *
 * Author : Stephen Smalley, <sds@tycho.nsa.gov>
 */
#ifndef _SELINUX_AVC_H_
#define _SELINUX_AVC_H_

#include <linux/stddef.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/kdev_t.h>
#include <linux/spinlock.h>
#include <linux/init.h>
#include <linux/audit.h>
#include <linux/lsm_audit.h>
#include <linux/in6.h>
#include "flask.h"
#include "av_permissions.h"
#include "security.h"

/*
 * An entry in the AVC.
 */
struct avc_entry;

struct task_struct;
struct inode;
struct sock;
struct sk_buff;

/*
 * AVC statistics
 */
struct avc_cache_stats {
	unsigned int lookups;
	unsigned int misses;
	unsigned int allocations;
	unsigned int reclaims;
	unsigned int frees;
};

/*
 * We only need this data after we have decided to send an audit message.
 */
struct selinux_audit_data {
	u32 ssid;
	u32 tsid;
	u16 tclass;
	u32 requested;
	u32 audited;
	u32 denied;
	int result;
	struct selinux_state *state;
};

/*
 * AVC operations
 */

void __init avc_init(void);

static inline u32 avc_audit_required(u32 requested,
				     struct av_decision *avd,
				     int result,
				     u32 auditdeny,
				     u32 *deniedp)
{
	u32 denied, audited;

	denied = requested & ~avd->allowed;
	if (unlikely(denied)) {
		audited = denied & avd->auditdeny;
		/*
		 * auditdeny is TRICKY!  Setting a bit in
		 * this field means that ANY denials should NOT be audited if
		 * the policy contains an explicit dontaudit rule for that
		 * permission.  Take notice that this is unrelated to the
		 * actual permissions that were denied.  As an example lets
		 * assume:
		 *
		 * denied == READ
		 * avd.auditdeny & ACCESS == 0 (not set means explicit rule)
		 * auditdeny & ACCESS == 1
		 *
		 * We will NOT audit the denial even though the denied
		 * permission was READ and the auditdeny checks were for
		 * ACCESS
		 */
		if (auditdeny && !(auditdeny & avd->auditdeny))
			audited = 0;
	} else if (result)
		audited = denied = requested;
	else
		audited = requested & avd->auditallow;
	*deniedp = denied;
	return audited;
}

int slow_avc_audit(struct selinux_state *state,
		   u32 ssid, u32 tsid, u16 tclass,
		   u32 requested, u32 audited, u32 denied, int result,
		   struct common_audit_data *a);

/**
 * avc_audit - Audit the granting or denial of permissions.
 * @ssid: source security identifier
 * @tsid: target security identifier
 * @tclass: target security class
 * @requested: requested permissions
 * @avd: access vector decisions
 * @result: result from avc_has_perm_noaudit
 * @a: auxiliary audit data
 * @flags: VFS walk flags
 *
 * Audit the granting or denial of permissions in accordance
 * with the policy.  This function is typically called by
 * avc_has_perm() after a permission check, but can also be
 * called directly by callers who use avc_has_perm_noaudit()
 * in order to separate the permission check from the auditing.
 * For example, this separation is useful when the permission check must
 * be performed under a lock, to allow the lock to be released
 * before calling the auditing code.
 */
static inline int avc_audit(struct selinux_state *state,
			    u32 ssid, u32 tsid,
			    u16 tclass, u32 requested,
			    struct av_decision *avd,
			    int result,
			    struct common_audit_data *a,
			    int flags)
{
	u32 audited, denied;

	audited = avc_audit_required(requested, avd, result, 0, &denied);
	if (likely(!audited))
		return 0;

	/* fall back to ref-walk if we have to generate audit */
	if (flags & MAY_NOT_BLOCK)
		return -ECHILD;

	return slow_avc_audit(state, ssid, tsid, tclass,
			      requested, audited, denied, result, a);
}

#define AVC_STRICT		1 /* Ignore permissive mode. */
#define AVC_EXTENDED_PERMS	2 /* update extended permissions */
#define AVC_NONBLOCKING		4 /* non blocking */

int avc_has_perm_noaudit(struct selinux_state *state,
			 u32 ssid, u32 tsid,
			 u16 tclass, u32 requested,
			 unsigned flags,
			 struct av_decision *avd);

int avc_has_perm(struct selinux_state *state,
		 u32 ssid, u32 tsid,
		 u16 tclass, u32 requested,
		 struct common_audit_data *auditdata);

int avc_has_perm_flags(struct selinux_state *state,
		       u32 ssid, u32 tsid,
		       u16 tclass, u32 requested,
		       struct common_audit_data *auditdata,
		       int flags);

int avc_has_extended_perms(struct selinux_state *state,
			   u32 ssid, u32 tsid, u16 tclass, u32 requested,
			   u8 driver, u8 perm, struct common_audit_data *ad);

u32 avc_policy_seqno(struct selinux_state *state);

#define AVC_CALLBACK_GRANT		1
#define AVC_CALLBACK_TRY_REVOKE		2
#define AVC_CALLBACK_REVOKE		4
#define AVC_CALLBACK_RESET		8
#define AVC_CALLBACK_AUDITALLOW_ENABLE	16
#define AVC_CALLBACK_AUDITALLOW_DISABLE	32
#define AVC_CALLBACK_AUDITDENY_ENABLE	64
#define AVC_CALLBACK_AUDITDENY_DISABLE	128
#define AVC_CALLBACK_ADD_XPERMS		256

int avc_add_callback(int (*callback)(u32 event), u32 events);

/* Exported to selinuxfs */
struct selinux_avc;
int avc_get_hash_stats(struct selinux_avc *avc, char *page);
unsigned int avc_get_cache_threshold(struct selinux_avc *avc);
void avc_set_cache_threshold(struct selinux_avc *avc,
			     unsigned int cache_threshold);

/* Attempt to free avc node cache */
void avc_disable(void);

#ifdef CONFIG_SECURITY_SELINUX_AVC_STATS
DECLARE_PER_CPU(struct avc_cache_stats, avc_cache_stats);
#endif

#endif /* _SELINUX_AVC_H_ */
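The kerneldoc above describes splitting a check into avc_has_perm_noaudit() plus avc_audit() when the check has to run under a lock. The sketch below is purely illustrative and not part of the header: the object manager, its spinlock and the identifiers are hypothetical stand-ins; only the AVC calls come from the declarations above.

/*
 * Illustrative only: a hypothetical object manager that performs the
 * permission check under a lock and defers the auditing until after
 * the lock has been dropped, as the avc_audit() kerneldoc suggests.
 */
static int example_check_and_audit(struct selinux_state *state,
				   spinlock_t *object_lock,
				   u32 ssid, u32 tsid, u16 tclass,
				   u32 requested,
				   struct common_audit_data *ad)
{
	struct av_decision avd;
	int rc, rc2;

	spin_lock(object_lock);
	/* No audit record is generated here, so holding the lock is fine. */
	rc = avc_has_perm_noaudit(state, ssid, tsid, tclass, requested,
				  0, &avd);
	spin_unlock(object_lock);

	/* Audit after unlocking; this may block to emit the record. */
	rc2 = avc_audit(state, ssid, tsid, tclass, requested, &avd, rc, ad, 0);
	if (rc2)
		return rc2;
	return rc;
}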
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_VMSTAT_H
#define _LINUX_VMSTAT_H

#include <linux/types.h>
#include <linux/percpu.h>
#include <linux/mmzone.h>
#include <linux/vm_event_item.h>
#include <linux/atomic.h>
#include <linux/static_key.h>
#include <linux/mmdebug.h>

extern int sysctl_stat_interval;

#ifdef CONFIG_NUMA
#define ENABLE_NUMA_STAT	1
#define DISABLE_NUMA_STAT	0
extern int sysctl_vm_numa_stat;
DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key);
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
				void *buffer, size_t *length, loff_t *ppos);
#endif

struct reclaim_stat {
	unsigned nr_dirty;
	unsigned nr_unqueued_dirty;
	unsigned nr_congested;
	unsigned nr_writeback;
	unsigned nr_immediate;
	unsigned nr_pageout;
	unsigned nr_activate[ANON_AND_FILE];
	unsigned nr_ref_keep;
	unsigned nr_unmap_fail;
	unsigned nr_lazyfree_fail;
};

enum writeback_stat_item {
	NR_DIRTY_THRESHOLD,
	NR_DIRTY_BG_THRESHOLD,
	NR_VM_WRITEBACK_STAT_ITEMS,
};

#ifdef CONFIG_VM_EVENT_COUNTERS
/*
 * Light weight per cpu counter implementation.
 *
 * Counters should only be incremented and no critical kernel component
 * should rely on the counter values.
 *
 * Counters are handled completely inline. On many platforms the code
 * generated will simply be the increment of a global address.
 */

struct vm_event_state {
	unsigned long event[NR_VM_EVENT_ITEMS];
};

DECLARE_PER_CPU(struct vm_event_state, vm_event_states);

/*
 * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the
 * local_irq_disable overhead.
 */
static inline void __count_vm_event(enum vm_event_item item)
{
	raw_cpu_inc(vm_event_states.event[item]);
}

static inline void count_vm_event(enum vm_event_item item)
{
	this_cpu_inc(vm_event_states.event[item]);
}

static inline void __count_vm_events(enum vm_event_item item, long delta)
{
	raw_cpu_add(vm_event_states.event[item], delta);
}

static inline void count_vm_events(enum vm_event_item item, long delta)
{
	this_cpu_add(vm_event_states.event[item], delta);
}

extern void all_vm_events(unsigned long *);

extern void vm_events_fold_cpu(int cpu);

#else

/* Disable counters */
static inline void count_vm_event(enum vm_event_item item)
{
}
static inline void count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void __count_vm_event(enum vm_event_item item)
{
}
static inline void __count_vm_events(enum vm_event_item item, long delta)
{
}
static inline void all_vm_events(unsigned long *ret)
{
}
static inline void vm_events_fold_cpu(int cpu)
{
}

#endif /* CONFIG_VM_EVENT_COUNTERS */

#ifdef CONFIG_NUMA_BALANCING
#define count_vm_numa_event(x)		count_vm_event(x)
#define count_vm_numa_events(x, y)	count_vm_events(x, y)
#else
#define count_vm_numa_event(x) do {} while (0)
#define count_vm_numa_events(x, y) do { (void)(y); } while (0)
#endif /* CONFIG_NUMA_BALANCING */

#ifdef CONFIG_DEBUG_TLBFLUSH
#define count_vm_tlb_event(x)		count_vm_event(x)
#define count_vm_tlb_events(x, y)	count_vm_events(x, y)
#else
#define count_vm_tlb_event(x) do {} while (0)
#define count_vm_tlb_events(x, y) do { (void)(y); } while (0)
#endif

#ifdef CONFIG_DEBUG_VM_VMACACHE
#define count_vm_vmacache_event(x) count_vm_event(x)
#else
#define count_vm_vmacache_event(x) do {} while (0)
#endif

#define __count_zid_vm_events(item, zid, delta) \
	__count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta)

/*
 * Zone and node-based page accounting with per cpu differentials.
 */
extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS];
extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS];
extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS];

#ifdef CONFIG_NUMA
static inline void zone_numa_state_add(long x, struct zone *zone,
				       enum numa_stat_item item)
{
	atomic_long_add(x, &zone->vm_numa_stat[item]);
	atomic_long_add(x, &vm_numa_stat[item]);
}

static inline unsigned long global_numa_state(enum numa_stat_item item)
{
	long x = atomic_long_read(&vm_numa_stat[item]);

	return x;
}

static inline unsigned long zone_numa_state_snapshot(struct zone *zone,
						     enum numa_stat_item item)
{
	long x = atomic_long_read(&zone->vm_numa_stat[item]);
	int cpu;

	for_each_online_cpu(cpu)
		x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item];

	return x;
}
#endif /* CONFIG_NUMA */

static inline void zone_page_state_add(long x, struct zone *zone,
				       enum zone_stat_item item)
{
	atomic_long_add(x, &zone->vm_stat[item]);
	atomic_long_add(x, &vm_zone_stat[item]);
}

static inline void node_page_state_add(long x, struct pglist_data *pgdat,
				       enum node_stat_item item)
{
	atomic_long_add(x, &pgdat->vm_stat[item]);
	atomic_long_add(x, &vm_node_stat[item]);
}

static inline unsigned long global_zone_page_state(enum zone_stat_item item)
{
	long x = atomic_long_read(&vm_zone_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

static inline
unsigned long global_node_page_state_pages(enum node_stat_item item)
{
	long x = atomic_long_read(&vm_node_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

static inline unsigned long global_node_page_state(enum node_stat_item item)
{
	VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));

	return global_node_page_state_pages(item);
}

static inline unsigned long zone_page_state(struct zone *zone,
					    enum zone_stat_item item)
{
	long x = atomic_long_read(&zone->vm_stat[item]);
#ifdef CONFIG_SMP
	if (x < 0)
		x = 0;
#endif
	return x;
}

/*
 * More accurate version that also considers the currently pending
 * deltas. For that we need to loop over all cpus to find the current
 * deltas. There is no synchronization so the result cannot be
 * exactly accurate either.
 */
static inline unsigned long zone_page_state_snapshot(struct zone *zone,
						     enum zone_stat_item item)
{
	long x = atomic_long_read(&zone->vm_stat[item]);

#ifdef CONFIG_SMP
	int cpu;
	for_each_online_cpu(cpu)
		x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item];

	if (x < 0)
		x = 0;
#endif
	return x;
}

#ifdef CONFIG_NUMA
extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item);
extern unsigned long sum_zone_node_page_state(int node,
					      enum zone_stat_item item);
extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item);
extern unsigned long node_page_state(struct pglist_data *pgdat,
				     enum node_stat_item item);
extern unsigned long node_page_state_pages(struct pglist_data *pgdat,
					   enum node_stat_item item);
#else
#define sum_zone_node_page_state(node, item) global_zone_page_state(item)
#define node_page_state(node, item) global_node_page_state(item)
#define node_page_state_pages(node, item) global_node_page_state_pages(item)
#endif /* CONFIG_NUMA */

#ifdef CONFIG_SMP
void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long);
void __inc_zone_page_state(struct page *, enum zone_stat_item);
void __dec_zone_page_state(struct page *, enum zone_stat_item);

void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long);
void __inc_node_page_state(struct page *, enum node_stat_item);
void __dec_node_page_state(struct page *, enum node_stat_item);

void mod_zone_page_state(struct zone *, enum zone_stat_item, long);
void inc_zone_page_state(struct page *, enum zone_stat_item);
void dec_zone_page_state(struct page *, enum zone_stat_item);

void mod_node_page_state(struct pglist_data *, enum node_stat_item, long);
void inc_node_page_state(struct page *, enum node_stat_item);
void dec_node_page_state(struct page *, enum node_stat_item);

extern void inc_node_state(struct pglist_data *, enum node_stat_item);
extern void __inc_zone_state(struct zone *, enum zone_stat_item);
extern void __inc_node_state(struct pglist_data *, enum node_stat_item);
extern void dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_zone_state(struct zone *, enum zone_stat_item);
extern void __dec_node_state(struct pglist_data *, enum node_stat_item);

void quiet_vmstat(void);
void cpu_vm_stats_fold(int cpu);
void refresh_zone_stat_thresholds(void);

struct ctl_table;
int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp,
		   loff_t *ppos);

void drain_zonestat(struct zone *zone, struct per_cpu_pageset *);

int calculate_pressure_threshold(struct zone *zone);
int calculate_normal_threshold(struct zone *zone);
void set_pgdat_percpu_threshold(pg_data_t *pgdat,
				int (*calculate_pressure)(struct zone *));
#else /* CONFIG_SMP */

/*
 * We do not maintain differentials in a single processor configuration.
 * The functions directly modify the zone and global counters.
 */
static inline void __mod_zone_page_state(struct zone *zone,
					 enum zone_stat_item item, long delta)
{
	zone_page_state_add(delta, zone, item);
}

static inline void __mod_node_page_state(struct pglist_data *pgdat,
					 enum node_stat_item item, int delta)
{
	if (vmstat_item_in_bytes(item)) {
		VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
		delta >>= PAGE_SHIFT;
	}

	node_page_state_add(delta, pgdat, item);
}

static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
{
	atomic_long_inc(&zone->vm_stat[item]);
	atomic_long_inc(&vm_zone_stat[item]);
}

static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	atomic_long_inc(&pgdat->vm_stat[item]);
	atomic_long_inc(&vm_node_stat[item]);
}

static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
{
	atomic_long_dec(&zone->vm_stat[item]);
	atomic_long_dec(&vm_zone_stat[item]);
}

static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
{
	atomic_long_dec(&pgdat->vm_stat[item]);
	atomic_long_dec(&vm_node_stat[item]);
}

static inline void __inc_zone_page_state(struct page *page,
					 enum zone_stat_item item)
{
	__inc_zone_state(page_zone(page), item);
}

static inline void __inc_node_page_state(struct page *page,
					 enum node_stat_item item)
{
	__inc_node_state(page_pgdat(page), item);
}

static inline void __dec_zone_page_state(struct page *page,
					 enum zone_stat_item item)
{
	__dec_zone_state(page_zone(page), item);
}

static inline void __dec_node_page_state(struct page *page,
					 enum node_stat_item item)
{
	__dec_node_state(page_pgdat(page), item);
}

/*
 * We only use atomic operations to update counters. So there is no need to
 * disable interrupts.
 */
#define inc_zone_page_state __inc_zone_page_state
#define dec_zone_page_state __dec_zone_page_state
#define mod_zone_page_state __mod_zone_page_state

#define inc_node_page_state __inc_node_page_state
#define dec_node_page_state __dec_node_page_state
#define mod_node_page_state __mod_node_page_state

#define inc_zone_state __inc_zone_state
#define inc_node_state __inc_node_state
#define dec_zone_state __dec_zone_state

#define set_pgdat_percpu_threshold(pgdat, callback) { }

static inline void refresh_zone_stat_thresholds(void) { }
static inline void cpu_vm_stats_fold(int cpu) { }
static inline void quiet_vmstat(void) { }

static inline void drain_zonestat(struct zone *zone,
				  struct per_cpu_pageset *pset) { }
#endif /* CONFIG_SMP */

static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages,
					     int migratetype)
{
	__mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages);
	if (is_migrate_cma(migratetype))
		__mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages);
}

extern const char * const vmstat_text[];

static inline const char *zone_stat_name(enum zone_stat_item item)
{
	return vmstat_text[item];
}

#ifdef CONFIG_NUMA
static inline const char *numa_stat_name(enum numa_stat_item item)
{
	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
			   item];
}
#endif /* CONFIG_NUMA */

static inline const char *node_stat_name(enum node_stat_item item)
{
	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
			   NR_VM_NUMA_STAT_ITEMS +
			   item];
}

static inline const char *lru_list_name(enum lru_list lru)
{
	return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
}

static inline const char *writeback_stat_name(enum writeback_stat_item item)
{
	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
			   NR_VM_NUMA_STAT_ITEMS +
			   NR_VM_NODE_STAT_ITEMS +
			   item];
}

#if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG)
static inline const char *vm_event_name(enum vm_event_item item)
{
	return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
			   NR_VM_NUMA_STAT_ITEMS +
			   NR_VM_NODE_STAT_ITEMS +
			   NR_VM_WRITEBACK_STAT_ITEMS +
			   item];
}
#endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */

#endif /* _LINUX_VMSTAT_H */
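As a reading aid, here is a short, purely illustrative sketch of how the counters declared above are commonly used. The surrounding function is hypothetical; PGFAULT and NR_FILE_PAGES come from linux/vm_event_item.h and linux/mmzone.h rather than this header. count_vm_event() is usable from any context, while the double-underscore variant assumes the caller already runs with preemption (or interrupts) disabled.

/* Illustrative only: event counters vs. a folded node-wide counter. */
static void example_vmstat_usage(void)
{
	unsigned long file_pages;

	count_vm_event(PGFAULT);	/* this_cpu_inc(), safe in any context */

	preempt_disable();
	__count_vm_event(PGFAULT);	/* raw_cpu_inc(), caller is already non-preemptible */
	preempt_enable();

	/* Global view of a node counter; clamped to >= 0 under CONFIG_SMP. */
	file_pages = global_node_page_state(NR_FILE_PAGES);
	(void)file_pages;
}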
/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved
 * Copyright 2005-2006 Ian Kent <raven@themaw.net>
 */

/* Internal header file for autofs */

#include <linux/auto_fs.h>
#include <linux/auto_dev-ioctl.h>

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/time.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/sched/signal.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/uaccess.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/completion.h>
#include <linux/file.h>
#include <linux/magic.h>

/* This is the range of ioctl() numbers we claim as ours */
#define AUTOFS_IOC_FIRST	AUTOFS_IOC_READY
#define AUTOFS_IOC_COUNT	32

#define AUTOFS_DEV_IOCTL_IOC_FIRST	(AUTOFS_DEV_IOCTL_VERSION)
#define AUTOFS_DEV_IOCTL_IOC_COUNT \
	(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD)

#ifdef pr_fmt
#undef pr_fmt
#endif
#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__

extern struct file_system_type autofs_fs_type;

/*
 * Unified info structure.  This is pointed to by both the dentry and
 * inode structures.  Each file in the filesystem has an instance of this
 * structure.  It holds a reference to the dentry, so dentries are never
 * flushed while the file exists.  All name lookups are dealt with at the
 * dentry level, although the filesystem can interfere in the validation
 * process.  Readdir is implemented by traversing the dentry lists.
 */
struct autofs_info {
	struct dentry *dentry;
	struct inode *inode;

	int flags;

	struct completion expire_complete;

	struct list_head active;
	struct list_head expiring;

	struct autofs_sb_info *sbi;
	unsigned long last_used;
	int count;

	kuid_t uid;
	kgid_t gid;
	struct rcu_head rcu;
};

#define AUTOFS_INF_EXPIRING	(1<<0) /* dentry in the process of expiring */
#define AUTOFS_INF_WANT_EXPIRE	(1<<1) /* the dentry is being considered
					* for expiry, so RCU_walk is
					* not permitted.  If it progresses to
					* actual expiry attempt, the flag is
					* not cleared when EXPIRING is set -
					* in that case it gets cleared only
					* when it comes to clearing EXPIRING.
					*/
#define AUTOFS_INF_PENDING	(1<<2) /* dentry pending mount */

struct autofs_wait_queue {
	wait_queue_head_t queue;
	struct autofs_wait_queue *next;
	autofs_wqt_t wait_queue_token;
	/* We use the following to see what we are waiting for */
	struct qstr name;
	u32 dev;
	u64 ino;
	kuid_t uid;
	kgid_t gid;
	pid_t pid;
	pid_t tgid;
	/* This is for status reporting upon return */
	int status;
	unsigned int wait_ctr;
};

#define AUTOFS_SBI_MAGIC 0x6d4a556d

#define AUTOFS_SBI_CATATONIC	0x0001
#define AUTOFS_SBI_STRICTEXPIRE 0x0002
#define AUTOFS_SBI_IGNORE	0x0004

struct autofs_sb_info {
	u32 magic;
	int pipefd;
	struct file *pipe;
	struct pid *oz_pgrp;
	int version;
	int sub_version;
	int min_proto;
	int max_proto;
	unsigned int flags;
	unsigned long exp_timeout;
	unsigned int type;
	struct super_block *sb;
	struct mutex wq_mutex;
	struct mutex pipe_mutex;
	spinlock_t fs_lock;
	struct autofs_wait_queue *queues; /* Wait queue pointer */
	spinlock_t lookup_lock;
	struct list_head active_list;
	struct list_head expiring_list;
	struct rcu_head rcu;
};

static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb)
{
	return (struct autofs_sb_info *)(sb->s_fs_info);
}

static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry)
{
	return (struct autofs_info *)(dentry->d_fsdata);
}

/* autofs_oz_mode(): do we see the man behind the curtain?  (The
 * processes which do manipulations for us in user space sees the raw
 * filesystem without "magic".)
 */
static inline int autofs_oz_mode(struct autofs_sb_info *sbi)
{
	return ((sbi->flags & AUTOFS_SBI_CATATONIC) ||
		 task_pgrp(current) == sbi->oz_pgrp);
}

struct inode *autofs_get_inode(struct super_block *, umode_t);
void autofs_free_ino(struct autofs_info *);

/* Expiration */
int is_autofs_dentry(struct dentry *);
int autofs_expire_wait(const struct path *path, int rcu_walk);
int autofs_expire_run(struct super_block *, struct vfsmount *,
		      struct autofs_sb_info *,
		      struct autofs_packet_expire __user *);
int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
			   struct autofs_sb_info *sbi, unsigned int how);
int autofs_expire_multi(struct super_block *, struct vfsmount *,
			struct autofs_sb_info *, int __user *);

/* Device node initialization */
int autofs_dev_ioctl_init(void);
void autofs_dev_ioctl_exit(void);

/* Operations structures */
extern const struct inode_operations autofs_symlink_inode_operations;
extern const struct inode_operations autofs_dir_inode_operations;
extern const struct file_operations autofs_dir_operations;
extern const struct file_operations autofs_root_operations;
extern const struct dentry_operations autofs_dentry_operations;

/* VFS automount flags management functions */
static inline void __managed_dentry_set_managed(struct dentry *dentry)
{
	dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
}

static inline void managed_dentry_set_managed(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	__managed_dentry_set_managed(dentry);
	spin_unlock(&dentry->d_lock);
}

static inline void __managed_dentry_clear_managed(struct dentry *dentry)
{
	dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT);
}

static inline void managed_dentry_clear_managed(struct dentry *dentry)
{
	spin_lock(&dentry->d_lock);
	__managed_dentry_clear_managed(dentry);
	spin_unlock(&dentry->d_lock);
}

/* Initializing function */
int autofs_fill_super(struct super_block *, void *, int);
struct autofs_info *autofs_new_ino(struct autofs_sb_info *);
void autofs_clean_ino(struct autofs_info *);

static inline int autofs_prepare_pipe(struct file *pipe)
{
	if (!(pipe->f_mode & FMODE_CAN_WRITE))
		return -EINVAL;
	if (!S_ISFIFO(file_inode(pipe)->i_mode))
		return -EINVAL;
	/* We want a packet pipe */
	pipe->f_flags |= O_DIRECT;
	/* We don't expect -EAGAIN */
	pipe->f_flags &= ~O_NONBLOCK;
	return 0;
}

/* Queue management functions */
int autofs_wait(struct autofs_sb_info *,
		const struct path *, enum autofs_notify);
int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
void autofs_catatonic_mode(struct autofs_sb_info *);

static inline u32 autofs_get_dev(struct autofs_sb_info *sbi)
{
	return new_encode_dev(sbi->sb->s_dev);
}

static inline u64 autofs_get_ino(struct autofs_sb_info *sbi)
{
	return d_inode(sbi->sb->s_root)->i_ino;
}

static inline void __autofs_add_expiring(struct dentry *dentry)
{
	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
	struct autofs_info *ino = autofs_dentry_ino(dentry);

	if (ino) {
		if (list_empty(&ino->expiring))
			list_add(&ino->expiring, &sbi->expiring_list);
	}
}

static inline void autofs_add_expiring(struct dentry *dentry)
{
	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
	struct autofs_info *ino = autofs_dentry_ino(dentry);

	if (ino) {
		spin_lock(&sbi->lookup_lock);
		if (list_empty(&ino->expiring))
			list_add(&ino->expiring, &sbi->expiring_list);
		spin_unlock(&sbi->lookup_lock);
	}
}

static inline void autofs_del_expiring(struct dentry *dentry)
{
	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);
	struct autofs_info *ino = autofs_dentry_ino(dentry);

	if (ino) {
		spin_lock(&sbi->lookup_lock);
		if (!list_empty(&ino->expiring))
			list_del_init(&ino->expiring);
		spin_unlock(&sbi->lookup_lock);
	}
}

void autofs_kill_sb(struct super_block *);
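A brief, purely illustrative note on the locking convention used by the helpers above: the double-underscore variants assume the relevant lock (dentry->d_lock or sbi->lookup_lock) is already held, while the plain versions take and drop it themselves. The function below is hypothetical and only exercises declarations from this header.

/* Illustrative only: choosing between the plain and __-prefixed helpers. */
static void example_queue_for_expiry(struct dentry *dentry, bool have_lookup_lock)
{
	struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb);

	if (have_lookup_lock)
		__autofs_add_expiring(dentry);	/* caller already holds sbi->lookup_lock */
	else
		autofs_add_expiring(dentry);	/* takes and drops sbi->lookup_lock */

	/* The daemon's own ("oz") processes see the filesystem without magic. */
	if (autofs_oz_mode(sbi))
		return;
}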
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_SWAP_H
#define _LINUX_SWAP_H

#include <linux/spinlock.h>
#include <linux/linkage.h>
#include <linux/mmzone.h>
#include <linux/list.h>
#include <linux/memcontrol.h>
#include <linux/sched.h>
#include <linux/node.h>
#include <linux/fs.h>
#include <linux/atomic.h>
#include <linux/page-flags.h>
#include <asm/page.h>

struct notifier_block;

struct bio;

struct pagevec;

#define SWAP_FLAG_PREFER	0x8000	/* set if swap priority specified */
#define SWAP_FLAG_PRIO_MASK	0x7fff
#define SWAP_FLAG_PRIO_SHIFT	0
#define SWAP_FLAG_DISCARD	0x10000 /* enable discard for swap */
#define SWAP_FLAG_DISCARD_ONCE	0x20000 /* discard swap area at swapon-time */
#define SWAP_FLAG_DISCARD_PAGES 0x40000 /* discard page-clusters after use */

#define SWAP_FLAGS_VALID	(SWAP_FLAG_PRIO_MASK | SWAP_FLAG_PREFER | \
				 SWAP_FLAG_DISCARD | SWAP_FLAG_DISCARD_ONCE | \
				 SWAP_FLAG_DISCARD_PAGES)
#define SWAP_BATCH 64

static inline int current_is_kswapd(void)
{
	return current->flags & PF_KSWAPD;
}

/*
 * MAX_SWAPFILES defines the maximum number of swaptypes: things which can
 * be swapped to.  The swap type and the offset into that swap type are
 * encoded into pte's and into pgoff_t's in the swapcache.  Using five bits
 * for the type means that the maximum number of swapcache pages is 27 bits
 * on 32-bit-pgoff_t architectures.  And that assumes that the architecture packs
 * the type/offset into the pte as 5/27 as well.
 */
#define MAX_SWAPFILES_SHIFT	5

/*
 * Use some of the swap files numbers for other purposes. This
 * is a convenient way to hook into the VM to trigger special
 * actions on faults.
 */

/*
 * Unaddressable device memory support. See include/linux/hmm.h and
 * Documentation/vm/hmm.rst. Short description is we need struct pages for
 * device memory that is unaddressable (inaccessible) by CPU, so that we can
 * migrate part of a process memory to device memory.
 *
 * When a page is migrated from CPU to device, we set the CPU page table entry
 * to a special SWP_DEVICE_* entry.
 */
#ifdef CONFIG_DEVICE_PRIVATE
#define SWP_DEVICE_NUM 2
#define SWP_DEVICE_WRITE (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM)
#define SWP_DEVICE_READ (MAX_SWAPFILES+SWP_HWPOISON_NUM+SWP_MIGRATION_NUM+1)
#else
#define SWP_DEVICE_NUM 0
#endif

/*
 * NUMA node memory migration support
 */
#ifdef CONFIG_MIGRATION
#define SWP_MIGRATION_NUM 2
#define SWP_MIGRATION_READ	(MAX_SWAPFILES + SWP_HWPOISON_NUM)
#define SWP_MIGRATION_WRITE	(MAX_SWAPFILES + SWP_HWPOISON_NUM + 1)
#else
#define SWP_MIGRATION_NUM 0
#endif

/*
 * Handling of hardware poisoned pages with memory corruption.
 */
#ifdef CONFIG_MEMORY_FAILURE
#define SWP_HWPOISON_NUM 1
#define SWP_HWPOISON		MAX_SWAPFILES
#else
#define SWP_HWPOISON_NUM 0
#endif

#define MAX_SWAPFILES \
	((1 << MAX_SWAPFILES_SHIFT) - SWP_DEVICE_NUM - \
	SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)

/*
 * Magic header for a swap area. The first part of the union is
 * what the swap magic looks like for the old (limited to 128MB)
 * swap area format, the second part of the union adds - in the
 * old reserved area - some extra information. Note that the first
 * kilobyte is reserved for boot loader or disk label stuff...
 *
 * Having the magic at the end of the PAGE_SIZE makes detecting swap
 * areas somewhat tricky on machines that support multiple page sizes.
 * For 2.5 we'll probably want to move the magic to just beyond the
 * bootbits...
 */
union swap_header {
	struct {
		char reserved[PAGE_SIZE - 10];
		char magic[10];		/* SWAP-SPACE or SWAPSPACE2 */
	} magic;
	struct {
		char		bootbits[1024];	/* Space for disklabel etc. */
		__u32		version;
		__u32		last_page;
		__u32		nr_badpages;
		unsigned char	sws_uuid[16];
		unsigned char	sws_volume[16];
		__u32		padding[117];
		__u32		badpages[1];
	} info;
};

/*
 * current->reclaim_state points to one of these when a task is running
 * memory reclaim
 */
struct reclaim_state {
	unsigned long reclaimed_slab;
};

#ifdef __KERNEL__

struct address_space;
struct sysinfo;
struct writeback_control;
struct zone;

/*
 * A swap extent maps a range of a swapfile's PAGE_SIZE pages onto a range of
 * disk blocks.  A list of swap extents maps the entire swapfile.  (Where the
 * term `swapfile' refers to either a blockdevice or an IS_REG file.  Apart
 * from setup, they're handled identically.
 *
 * We always assume that blocks are of size PAGE_SIZE.
 */
struct swap_extent {
	struct rb_node rb_node;
	pgoff_t start_page;
	pgoff_t nr_pages;
	sector_t start_block;
};

/*
 * Max bad pages in the new format..
 */
#define MAX_SWAP_BADPAGES \
	((offsetof(union swap_header, magic.magic) - \
	  offsetof(union swap_header, info.badpages)) / sizeof(int))

enum {
	SWP_USED	= (1 << 0),	/* is slot in swap_info[] used? */
	SWP_WRITEOK	= (1 << 1),	/* ok to write to this swap?	*/
	SWP_DISCARDABLE = (1 << 2),	/* blkdev support discard */
	SWP_DISCARDING	= (1 << 3),	/* now discarding a free cluster */
	SWP_SOLIDSTATE	= (1 << 4),	/* blkdev seeks are cheap */
	SWP_CONTINUED	= (1 << 5),	/* swap_map has count continuation */
	SWP_BLKDEV	= (1 << 6),	/* its a block device */
	SWP_ACTIVATED	= (1 << 7),	/* set after swap_activate success */
	SWP_FS_OPS	= (1 << 8),	/* swapfile operations go through fs */
	SWP_AREA_DISCARD = (1 << 9),	/* single-time swap area discards */
	SWP_PAGE_DISCARD = (1 << 10),	/* freed swap page-cluster discards */
	SWP_STABLE_WRITES = (1 << 11),	/* no overwrite PG_writeback pages */
	SWP_SYNCHRONOUS_IO = (1 << 12),	/* synchronous IO is efficient */
	SWP_VALID	= (1 << 13),	/* swap is valid to be operated on? */
					/* add others here before... */
	SWP_SCANNING	= (1 << 14),	/* refcount in scan_swap_map */
};

#define SWAP_CLUSTER_MAX 32UL
#define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX

/* Bit flag in swap_map */
#define SWAP_HAS_CACHE	0x40	/* Flag page is cached, in first swap_map */
#define COUNT_CONTINUED	0x80	/* Flag swap_map continuation for full count */

/* Special value in first swap_map */
#define SWAP_MAP_MAX	0x3e	/* Max count */
#define SWAP_MAP_BAD	0x3f	/* Note page is bad */
#define SWAP_MAP_SHMEM	0xbf	/* Owned by shmem/tmpfs */

/* Special value in each swap_map continuation */
#define SWAP_CONT_MAX	0x7f	/* Max count */

/*
 * We use this to track usage of a cluster. A cluster is a block of swap disk
 * space with SWAPFILE_CLUSTER pages long and naturally aligns in disk. All
 * free clusters are organized into a list. We fetch an entry from the list to
 * get a free cluster.
 *
 * The data field stores next cluster if the cluster is free or cluster usage
 * counter otherwise. The flags field determines if a cluster is free. This is
 * protected by swap_info_struct.lock.
 */
struct swap_cluster_info {
	spinlock_t lock;	/*
				 * Protect swap_cluster_info fields
				 * and swap_info_struct->swap_map
				 * elements correspond to the swap
				 * cluster
				 */
	unsigned int data:24;
	unsigned int flags:8;
};
#define CLUSTER_FLAG_FREE 1 /* This cluster is free */
#define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */
#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */

/*
 * We assign a cluster to each CPU, so each CPU can allocate swap entry from
 * its own cluster and swapout sequentially. The purpose is to optimize swapout
 * throughput.
 */
struct percpu_cluster {
	struct swap_cluster_info index; /* Current cluster index */
	unsigned int next; /* Likely next allocation offset */
};

struct swap_cluster_list {
	struct swap_cluster_info head;
	struct swap_cluster_info tail;
};

/*
 * The in-memory structure used to track swap areas.
 */
struct swap_info_struct {
	unsigned long flags;		/* SWP_USED etc: see above */
	signed short prio;		/* swap priority of this type */
	struct plist_node list;		/* entry in swap_active_head */
	signed char type;		/* strange name for an index */
	unsigned int max;		/* extent of the swap_map */
	unsigned char *swap_map;	/* vmalloc'ed array of usage counts */
	struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
	struct swap_cluster_list free_clusters; /* free clusters list */
	unsigned int lowest_bit;	/* index of first free in swap_map */
	unsigned int highest_bit;	/* index of last free in swap_map */
	unsigned int pages;		/* total of usable pages of swap */
	unsigned int inuse_pages;	/* number of those currently in use */
	unsigned int cluster_next;	/* likely index for next allocation */
	unsigned int cluster_nr;	/* countdown to next cluster search */
	unsigned int __percpu *cluster_next_cpu; /*percpu index for next allocation */
	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
	struct block_device *bdev;	/* swap device or bdev of swap file */
	struct file *swap_file;		/* seldom referenced */
	unsigned int old_block_size;	/* seldom referenced */
#ifdef CONFIG_FRONTSWAP
	unsigned long *frontswap_map;	/* frontswap in-use, one bit per page */
	atomic_t frontswap_pages;	/* frontswap pages in-use counter */
#endif
	spinlock_t lock;		/*
					 * protect map scan related fields like
					 * swap_map, lowest_bit, highest_bit,
					 * inuse_pages, cluster_next,
					 * cluster_nr, lowest_alloc,
					 * highest_alloc, free/discard cluster
					 * list. other fields are only changed
					 * at swapon/swapoff, so are protected
					 * by swap_lock. changing flags need
					 * hold this lock and swap_lock. If
					 * both locks need hold, hold swap_lock
					 * first.
					 */
	spinlock_t cont_lock;		/*
					 * protect swap count continuation page
					 * list.
					 */
	struct work_struct discard_work; /* discard worker */
	struct swap_cluster_list discard_clusters; /* discard clusters list */
	struct plist_node avail_lists[]; /*
					  * entries in swap_avail_heads, one
					  * entry per node.
					  * Must be last as the number of the
					  * array is nr_node_ids, which is not
					  * a fixed value so have to allocate
					  * dynamically.
					  * And it has to be an array so that
					  * plist_for_each_* can work.
					  */
};

#ifdef CONFIG_64BIT
#define SWAP_RA_ORDER_CEILING	5
#else
/* Avoid stack overflow, because we need to save part of page table */
#define SWAP_RA_ORDER_CEILING	3
#define SWAP_RA_PTE_CACHE_SIZE	(1 << SWAP_RA_ORDER_CEILING)
#endif

struct vma_swap_readahead {
	unsigned short win;
	unsigned short offset;
	unsigned short nr_pte;
#ifdef CONFIG_64BIT
	pte_t *ptes;
#else
	pte_t ptes[SWAP_RA_PTE_CACHE_SIZE];
#endif
};

/* linux/mm/workingset.c */
void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages);
void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg);
void workingset_refault(struct page *page, void *shadow);
void workingset_activation(struct page *page);

/* Only track the nodes of mappings with shadow entries */
void workingset_update_node(struct xa_node *node);
#define mapping_set_update(xas, mapping) do {				\
	if (!dax_mapping(mapping) && !shmem_mapping(mapping))		\
		xas_set_update(xas, workingset_update_node);		\
} while (0)

/* linux/mm/page_alloc.c */
extern unsigned long totalreserve_pages;
extern unsigned long nr_free_buffer_pages(void);

/* Definition of global_zone_page_state not available yet */
#define nr_free_pages() global_zone_page_state(NR_FREE_PAGES)

/* linux/mm/swap.c */
extern void lru_note_cost(struct lruvec *lruvec, bool file,
			  unsigned int nr_pages);
extern void lru_note_cost_page(struct page *);
extern void lru_cache_add(struct page *);
extern void lru_add_page_tail(struct page *page, struct page *page_tail,
			      struct lruvec *lruvec, struct list_head *head);
extern void mark_page_accessed(struct page *);
extern void lru_add_drain(void);
extern void lru_add_drain_cpu(int cpu);
extern void lru_add_drain_cpu_zone(struct zone *zone);
extern void lru_add_drain_all(void);
extern void rotate_reclaimable_page(struct page *page);
extern void deactivate_file_page(struct page *page);
extern void deactivate_page(struct page *page);
extern void mark_page_lazyfree(struct page *page);
extern void swap_setup(void);

extern void lru_cache_add_inactive_or_unevictable(struct page *page,
						  struct vm_area_struct *vma);

/* linux/mm/vmscan.c */
extern unsigned long zone_reclaimable_pages(struct zone *zone);
extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
				       gfp_t gfp_mask, nodemask_t *mask);
extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
						  unsigned long nr_pages,
						  gfp_t gfp_mask,
						  bool may_swap);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
					    gfp_t gfp_mask, bool noswap,
					    pg_data_t *pgdat,
					    unsigned long *nr_scanned);
extern unsigned long shrink_all_memory(unsigned long nr_pages);
extern int vm_swappiness;
extern int remove_mapping(struct address_space *mapping, struct page *page);

extern unsigned long reclaim_pages(struct list_head *page_list);
#ifdef CONFIG_NUMA
extern int node_reclaim_mode;
extern int sysctl_min_unmapped_ratio;
extern int sysctl_min_slab_ratio;
#else
#define node_reclaim_mode 0
#endif

extern void check_move_unevictable_pages(struct pagevec *pvec);

extern int kswapd_run(int nid);
extern void kswapd_stop(int nid);

#ifdef CONFIG_SWAP

#include <linux/blk_types.h> /* for bio_end_io_t */

/* linux/mm/page_io.c */
extern int swap_readpage(struct page *page, bool do_poll);
extern int swap_writepage(struct page *page, struct writeback_control *wbc);
extern void end_swap_bio_write(struct bio *bio);
extern int __swap_writepage(struct page *page, struct writeback_control *wbc,
			    bio_end_io_t end_write_func);
extern int swap_set_page_dirty(struct page *page);

int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page,
		    unsigned long nr_pages, sector_t start_block);
int generic_swapfile_activate(struct swap_info_struct *, struct file *,
			      sector_t *);

/* linux/mm/swap_state.c */
/* One swap address space for each 64M swap space */
#define SWAP_ADDRESS_SPACE_SHIFT	14
#define SWAP_ADDRESS_SPACE_PAGES	(1 << SWAP_ADDRESS_SPACE_SHIFT)
extern struct address_space *swapper_spaces[];
#define swap_address_space(entry)			    \
	(&swapper_spaces[swp_type(entry)][swp_offset(entry) \
		>> SWAP_ADDRESS_SPACE_SHIFT])
extern unsigned long total_swapcache_pages(void);
extern void show_swap_cache_info(void);
extern int add_to_swap(struct page *page);
extern void *get_shadow_from_swap_cache(swp_entry_t entry);
extern int add_to_swap_cache(struct page *page, swp_entry_t entry,
			     gfp_t gfp, void **shadowp);
extern void __delete_from_swap_cache(struct page *page,
				     swp_entry_t entry, void *shadow);
extern void delete_from_swap_cache(struct page *);
extern void clear_shadow_from_swap_cache(int type, unsigned long begin,
					 unsigned long end);
extern void free_page_and_swap_cache(struct page *);
extern void free_pages_and_swap_cache(struct page **, int);
extern struct page *lookup_swap_cache(swp_entry_t entry,
				      struct vm_area_struct *vma,
				      unsigned long addr);
struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index);
extern struct page *read_swap_cache_async(swp_entry_t, gfp_t,
					  struct vm_area_struct *vma,
					  unsigned long addr, bool do_poll);
extern struct page *__read_swap_cache_async(swp_entry_t, gfp_t,
					    struct vm_area_struct *vma,
					    unsigned long addr,
					    bool *new_page_allocated);
extern struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
					   struct vm_fault *vmf);
extern struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,
				     struct vm_fault *vmf);

/* linux/mm/swapfile.c */
extern atomic_long_t nr_swap_pages;
extern long total_swap_pages;
extern atomic_t nr_rotate_swap;
extern bool has_usable_swap(void);

/* Swap 50% full? Release swapcache more aggressively.. */
static inline bool vm_swap_full(void)
{
	return atomic_long_read(&nr_swap_pages) * 2 < total_swap_pages;
}

static inline long get_nr_swap_pages(void)
{
	return atomic_long_read(&nr_swap_pages);
}

extern void si_swapinfo(struct sysinfo *);
extern swp_entry_t get_swap_page(struct page *page);
extern void put_swap_page(struct page *page, swp_entry_t entry);
extern swp_entry_t get_swap_page_of_type(int);
extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size);
extern int add_swap_count_continuation(swp_entry_t, gfp_t);
extern void swap_shmem_alloc(swp_entry_t);
extern int swap_duplicate(swp_entry_t);
extern int swapcache_prepare(swp_entry_t);
extern void swap_free(swp_entry_t);
extern void swapcache_free_entries(swp_entry_t *entries, int n);
extern int free_swap_and_cache(swp_entry_t);
int swap_type_of(dev_t device, sector_t offset);
int find_first_swap(dev_t *device);
extern unsigned int count_swap_pages(int, int);
extern sector_t map_swap_page(struct page *, struct block_device **);
extern sector_t swapdev_block(int, pgoff_t);
extern int page_swapcount(struct page *);
extern int __swap_count(swp_entry_t entry);
extern int __swp_swapcount(swp_entry_t entry);
extern int swp_swapcount(swp_entry_t entry);
extern struct swap_info_struct *page_swap_info(struct page *);
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
extern bool reuse_swap_page(struct page *, int *);
extern int try_to_free_swap(struct page *);
struct backing_dev_info;
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
extern void exit_swap_address_space(unsigned int type);
extern struct swap_info_struct *get_swap_device(swp_entry_t entry);
sector_t swap_page_sector(struct page *page);

static inline void put_swap_device(struct swap_info_struct *si)
{
	rcu_read_unlock();
}

#else /* CONFIG_SWAP */

static inline int swap_readpage(struct page *page, bool do_poll)
{
	return 0;
}

static inline struct swap_info_struct *swp_swap_info(swp_entry_t entry)
{
	return NULL;
}

#define swap_address_space(entry)		(NULL)
#define get_nr_swap_pages()			0L
#define total_swap_pages			0L
#define total_swapcache_pages()			0UL
#define vm_swap_full()				0

#define si_swapinfo(val) \
	do { (val)->freeswap = (val)->totalswap = 0; } while (0)
/* only sparc can not include linux/pagemap.h in this file
 * so leave put_page and release_pages undeclared... */
#define free_page_and_swap_cache(page) \
	put_page(page)
#define free_pages_and_swap_cache(pages, nr) \
	release_pages((pages), (nr));

static inline void show_swap_cache_info(void)
{
}

#define free_swap_and_cache(e) ({(is_migration_entry(e) || is_device_private_entry(e));})
#define swapcache_prepare(e) ({(is_migration_entry(e) || is_device_private_entry(e));})

static inline int add_swap_count_continuation(swp_entry_t swp, gfp_t gfp_mask)
{
	return 0;
}

static inline void swap_shmem_alloc(swp_entry_t swp)
{
}

static inline int swap_duplicate(swp_entry_t swp)
{
	return 0;
}

static inline void swap_free(swp_entry_t swp)
{
}

static inline void put_swap_page(struct page *page, swp_entry_t swp)
{
}

static inline struct page *swap_cluster_readahead(swp_entry_t entry,
						  gfp_t gfp_mask,
						  struct vm_fault *vmf)
{
	return NULL;
}

static inline struct page *swapin_readahead(swp_entry_t swp, gfp_t gfp_mask,
					    struct vm_fault *vmf)
{
	return NULL;
}

static inline int swap_writepage(struct page *p, struct writeback_control *wbc)
{
	return 0;
}

static inline struct page *lookup_swap_cache(swp_entry_t swp,
					     struct vm_area_struct *vma,
					     unsigned long addr)
{
	return NULL;
}

static inline
struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index)
{
	return find_get_page(mapping, index);
}

static inline int add_to_swap(struct page *page)
{
	return 0;
}

static inline void *get_shadow_from_swap_cache(swp_entry_t entry)
{
	return NULL;
}

static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
				    gfp_t gfp_mask, void **shadowp)
{
	return -1;
}

static inline void __delete_from_swap_cache(struct page *page,
					    swp_entry_t entry, void *shadow)
{
}

static inline void delete_from_swap_cache(struct page *page)
{
}

static inline void clear_shadow_from_swap_cache(int type, unsigned long begin,
						unsigned long end)
{
}

static inline int page_swapcount(struct page *page)
{
	return 0;
}

static inline int __swap_count(swp_entry_t entry)
{
	return 0;
}

static inline int __swp_swapcount(swp_entry_t entry)
{
	return 0;
}

static inline int swp_swapcount(swp_entry_t entry)
{
	return 0;
}

#define reuse_swap_page(page, total_map_swapcount) \
	(page_trans_huge_mapcount(page, total_map_swapcount) == 1)

static inline int try_to_free_swap(struct page *page)
{
	return 0;
}

static inline swp_entry_t get_swap_page(struct page *page)
{
	swp_entry_t entry;
	entry.val = 0;
	return entry;
}

#endif /* CONFIG_SWAP */

#ifdef CONFIG_THP_SWAP
extern int split_swap_cluster(swp_entry_t entry);
#else
static inline int split_swap_cluster(swp_entry_t entry)
{
	return 0;
}
#endif

#ifdef CONFIG_MEMCG
static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg)
{
	/* Cgroup2 doesn't have per-cgroup swappiness */
	if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
		return vm_swappiness;

	/* root ? */
	if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
		return vm_swappiness;

	return memcg->swappiness;
}
#else
static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
{
	return vm_swappiness;
}
#endif

#if defined(CONFIG_SWAP) && defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
extern void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask);
#else
static inline void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
}
#endif

#ifdef CONFIG_MEMCG_SWAP
extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
extern int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry);
extern void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages);
extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg);
extern bool mem_cgroup_swap_full(struct page *page);
#else
static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
}

static inline int mem_cgroup_try_charge_swap(struct page *page,
					     swp_entry_t entry)
{
	return 0;
}

static inline void mem_cgroup_uncharge_swap(swp_entry_t entry,
					    unsigned int nr_pages)
{
}

static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
	return get_nr_swap_pages();
}

static inline bool mem_cgroup_swap_full(struct page *page)
{
	return vm_swap_full();
}
#endif

#endif /* __KERNEL__*/
#endif /* _LINUX_SWAP_H */
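To make the reference-counting flavour of the swap entry API above concrete, here is a purely illustrative, hypothetical sketch: swap_duplicate() takes an extra reference on an already-allocated entry and swap_free() drops one, and with !CONFIG_SWAP the stub definitions above turn the whole sequence into a no-op.

/* Illustrative only: pairing swap_duplicate() with swap_free(). */
static int example_share_swap_entry(swp_entry_t entry)
{
	int err;

	err = swap_duplicate(entry);	/* take an extra reference; may fail, e.g. -ENOMEM */
	if (err)
		return err;

	/* ... map the entry into a second place ... */

	swap_free(entry);		/* drop the reference taken above */
	return 0;
}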
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 
916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the AF_INET socket handler. * * Version: @(#)sock.h 1.0.4 05/13/93 * * Authors: Ross Biro * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> * Corey Minyard <wf-rch!minyard@relay.EU.net> * Florian La Roche <flla@stud.uni-sb.de> * * Fixes: * Alan Cox : Volatiles in skbuff pointers. See * skbuff comments. May be overdone, * better to prove they can be removed * than the reverse. * Alan Cox : Added a zapped field for tcp to note * a socket is reset and must stay shut up * Alan Cox : New fields for options * Pauline Middelink : identd support * Alan Cox : Eliminate low level recv/recvfrom * David S. Miller : New socket lookup architecture. * Steve Whitehouse: Default routines for sock_ops * Arnaldo C. 
Melo : removed net_pinfo, tp_pinfo and made * protinfo be just a void pointer, as the * protocol specific parts were moved to * respective headers and ipv4/v6, etc now * use private slabcaches for its socks * Pedro Hortas : New flags field for socket options */ #ifndef _SOCK_H #define _SOCK_H #include <linux/hardirq.h> #include <linux/kernel.h> #include <linux/list.h> #include <linux/list_nulls.h> #include <linux/timer.h> #include <linux/cache.h> #include <linux/bitops.h> #include <linux/lockdep.h> #include <linux/netdevice.h> #include <linux/skbuff.h> /* struct sk_buff */ #include <linux/mm.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <linux/page_counter.h> #include <linux/memcontrol.h> #include <linux/static_key.h> #include <linux/sched.h> #include <linux/wait.h> #include <linux/cgroup-defs.h> #include <linux/rbtree.h> #include <linux/filter.h> #include <linux/rculist_nulls.h> #include <linux/poll.h> #include <linux/sockptr.h> #include <linux/atomic.h> #include <linux/refcount.h> #include <net/dst.h> #include <net/checksum.h> #include <net/tcp_states.h> #include <linux/net_tstamp.h> #include <net/l3mdev.h> /* * This structure really needs to be cleaned up. * Most of it is for TCP, and not used by any of * the other protocols. */ /* Define this to get the SOCK_DBG debugging facility. */ #define SOCK_DEBUGGING #ifdef SOCK_DEBUGGING #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \ printk(KERN_DEBUG msg); } while (0) #else /* Validate arguments and do nothing */ static inline __printf(2, 3) void SOCK_DEBUG(const struct sock *sk, const char *msg, ...) { } #endif /* This is the per-socket lock. The spinlock provides a synchronization * between user contexts and software interrupt processing, whereas the * mini-semaphore synchronizes multiple users amongst themselves. 
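 *
 * A minimal usage sketch (illustrative only, not taken from this header):
 * process context takes the owner side and may sleep, while softirq code
 * uses only the spinlock:
 *
 *	lock_sock(sk);		owner side, may sleep
 *	... modify socket state ...
 *	release_sock(sk);	also runs the queued backlog
 *
 *	bh_lock_sock(sk);	softirq side, spinlock only
 *	... protocol input path ...
 *	bh_unlock_sock(sk);
 *
 * lock_sock(), release_sock() and the bh_lock_sock() macros are declared
 * further down in this header.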
*/ typedef struct { spinlock_t slock; int owned; wait_queue_head_t wq; /* * We express the mutex-alike socket_lock semantics * to the lock validator by explicitly managing * the slock as a lock variant (in addition to * the slock itself): */ #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif } socket_lock_t; struct sock; struct proto; struct net; typedef __u32 __bitwise __portpair; typedef __u64 __bitwise __addrpair; /** * struct sock_common - minimal network layer representation of sockets * @skc_daddr: Foreign IPv4 addr * @skc_rcv_saddr: Bound local IPv4 addr * @skc_addrpair: 8-byte-aligned __u64 union of @skc_daddr & @skc_rcv_saddr * @skc_hash: hash value used with various protocol lookup tables * @skc_u16hashes: two u16 hash values used by UDP lookup tables * @skc_dport: placeholder for inet_dport/tw_dport * @skc_num: placeholder for inet_num/tw_num * @skc_portpair: __u32 union of @skc_dport & @skc_num * @skc_family: network address family * @skc_state: Connection state * @skc_reuse: %SO_REUSEADDR setting * @skc_reuseport: %SO_REUSEPORT setting * @skc_ipv6only: socket is IPV6 only * @skc_net_refcnt: socket is using net ref counting * @skc_bound_dev_if: bound device index if != 0 * @skc_bind_node: bind hash linkage for various protocol lookup tables * @skc_portaddr_node: second hash linkage for UDP/UDP-Lite protocol * @skc_prot: protocol handlers inside a network family * @skc_net: reference to the network namespace of this socket * @skc_v6_daddr: IPV6 destination address * @skc_v6_rcv_saddr: IPV6 source address * @skc_cookie: socket's cookie value * @skc_node: main hash linkage for various protocol lookup tables * @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol * @skc_tx_queue_mapping: tx queue number for this connection * @skc_rx_queue_mapping: rx queue number for this connection * @skc_flags: place holder for sk_flags * %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, * %SO_OOBINLINE settings, %SO_TIMESTAMPING settings * @skc_listener: connection request listener socket (aka rsk_listener) * [union with @skc_flags] * @skc_tw_dr: (aka tw_dr) ptr to &struct inet_timewait_death_row * [union with @skc_flags] * @skc_incoming_cpu: record/match cpu processing incoming packets * @skc_rcv_wnd: (aka rsk_rcv_wnd) TCP receive window size (possibly scaled) * [union with @skc_incoming_cpu] * @skc_tw_rcv_nxt: (aka tw_rcv_nxt) TCP window next expected seq number * [union with @skc_incoming_cpu] * @skc_refcnt: reference count * * This is the minimal network layer representation of sockets, the header * for struct sock and struct inet_timewait_sock. 
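 *
 * Usage note (an illustrative sketch; the local variable names are
 * hypothetical): most code reaches these members through the sk_* aliases
 * that struct sock defines below, e.g.:
 *
 *	__be16 rport = sk->sk_dport;	peer port, network byte order
 *	__u16  lport = sk->sk_num;	local port, host byte order
 *	if (sk->sk_family == AF_INET6)
 *		daddr6 = sk->sk_v6_daddr;	only with CONFIG_IPV6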
*/ struct sock_common { /* skc_daddr and skc_rcv_saddr must be grouped on a 8 bytes aligned * address on 64bit arches : cf INET_MATCH() */ union { __addrpair skc_addrpair; struct { __be32 skc_daddr; __be32 skc_rcv_saddr; }; }; union { unsigned int skc_hash; __u16 skc_u16hashes[2]; }; /* skc_dport && skc_num must be grouped as well */ union { __portpair skc_portpair; struct { __be16 skc_dport; __u16 skc_num; }; }; unsigned short skc_family; volatile unsigned char skc_state; unsigned char skc_reuse:4; unsigned char skc_reuseport:1; unsigned char skc_ipv6only:1; unsigned char skc_net_refcnt:1; int skc_bound_dev_if; union { struct hlist_node skc_bind_node; struct hlist_node skc_portaddr_node; }; struct proto *skc_prot; possible_net_t skc_net; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr skc_v6_daddr; struct in6_addr skc_v6_rcv_saddr; #endif atomic64_t skc_cookie; /* following fields are padding to force * offset(struct sock, sk_refcnt) == 128 on 64bit arches * assuming IPV6 is enabled. We use this padding differently * for different kind of 'sockets' */ union { unsigned long skc_flags; struct sock *skc_listener; /* request_sock */ struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */ }; /* * fields between dontcopy_begin/dontcopy_end * are not copied in sock_copy() */ /* private: */ int skc_dontcopy_begin[0]; /* public: */ union { struct hlist_node skc_node; struct hlist_nulls_node skc_nulls_node; }; unsigned short skc_tx_queue_mapping; #ifdef CONFIG_XPS unsigned short skc_rx_queue_mapping; #endif union { int skc_incoming_cpu; u32 skc_rcv_wnd; u32 skc_tw_rcv_nxt; /* struct tcp_timewait_sock */ }; refcount_t skc_refcnt; /* private: */ int skc_dontcopy_end[0]; union { u32 skc_rxhash; u32 skc_window_clamp; u32 skc_tw_snd_nxt; /* struct tcp_timewait_sock */ }; /* public: */ }; struct bpf_local_storage; /** * struct sock - network layer representation of sockets * @__sk_common: shared layout with inet_timewait_sock * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings * @sk_lock: synchronizer * @sk_kern_sock: True if sock is using kernel lock classes * @sk_rcvbuf: size of receive buffer in bytes * @sk_wq: sock wait queue and async head * @sk_rx_dst: receive input route used by early demux * @sk_dst_cache: destination cache * @sk_dst_pending_confirm: need to confirm neighbour * @sk_policy: flow policy * @sk_rx_skb_cache: cache copy of recently accessed RX skb * @sk_receive_queue: incoming packets * @sk_wmem_alloc: transmit queue bytes committed * @sk_tsq_flags: TCP Small Queues flags * @sk_write_queue: Packet sending queue * @sk_omem_alloc: "o" is "option" or "other" * @sk_wmem_queued: persistent queue size * @sk_forward_alloc: space allocated forward * @sk_napi_id: id of the last napi context to receive data for sk * @sk_ll_usec: usecs to busypoll when there is no data * @sk_allocation: allocation mode * @sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler) * @sk_pacing_status: Pacing status (requested, handled by sch_fq) * @sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE) * @sk_sndbuf: size of send buffer in bytes * @__sk_flags_offset: empty field used to determine location of bitfield * @sk_padding: unused element for alignment * @sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets * @sk_no_check_rx: allow zero checksum in RX packets * @sk_route_caps: route capabilities (e.g. 
%NETIF_F_TSO) * @sk_route_nocaps: forbidden route capabilities (e.g NETIF_F_GSO_MASK) * @sk_route_forced_caps: static, forced route capabilities * (set in tcp_init_sock()) * @sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4) * @sk_gso_max_size: Maximum GSO segment size to build * @sk_gso_max_segs: Maximum number of GSO segments * @sk_pacing_shift: scaling factor for TCP Small Queues * @sk_lingertime: %SO_LINGER l_linger setting * @sk_backlog: always used with the per-socket spinlock held * @sk_callback_lock: used with the callbacks in the end of this struct * @sk_error_queue: rarely used * @sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, * IPV6_ADDRFORM for instance) * @sk_err: last error * @sk_err_soft: errors that don't cause failure but are the cause of a * persistent failure not just 'timed out' * @sk_drops: raw/udp drops counter * @sk_ack_backlog: current listen backlog * @sk_max_ack_backlog: listen backlog set in listen() * @sk_uid: user id of owner * @sk_priority: %SO_PRIORITY setting * @sk_type: socket type (%SOCK_STREAM, etc) * @sk_protocol: which protocol this socket belongs in this network family * @sk_peer_pid: &struct pid for this socket's peer * @sk_peer_cred: %SO_PEERCRED setting * @sk_rcvlowat: %SO_RCVLOWAT setting * @sk_rcvtimeo: %SO_RCVTIMEO setting * @sk_sndtimeo: %SO_SNDTIMEO setting * @sk_txhash: computed flow hash for use on transmit * @sk_filter: socket filtering instructions * @sk_timer: sock cleanup timer * @sk_stamp: time stamp of last packet received * @sk_stamp_seq: lock for accessing sk_stamp on 32 bit architectures only * @sk_tsflags: SO_TIMESTAMPING socket options * @sk_tskey: counter to disambiguate concurrent tstamp requests * @sk_zckey: counter to order MSG_ZEROCOPY notifications * @sk_socket: Identd and reporting IO signals * @sk_user_data: RPC layer private data * @sk_frag: cached page frag * @sk_peek_off: current peek_offset value * @sk_send_head: front of stuff to transmit * @tcp_rtx_queue: TCP re-transmit queue [union with @sk_send_head] * @sk_tx_skb_cache: cache copy of recently accessed TX skb * @sk_security: used by security modules * @sk_mark: generic packet mark * @sk_cgrp_data: cgroup data for this cgroup * @sk_memcg: this socket's memory cgroup association * @sk_write_pending: a write to stream socket waits to start * @sk_state_change: callback to indicate change in the state of the sock * @sk_data_ready: callback to indicate there is data to be processed * @sk_write_space: callback to indicate there is bf sending space available * @sk_error_report: callback to indicate errors (e.g. %MSG_ERRQUEUE) * @sk_backlog_rcv: callback to process the backlog * @sk_validate_xmit_skb: ptr to an optional validate function * @sk_destruct: called at sock freeing time, i.e. 
when all refcnt == 0 * @sk_reuseport_cb: reuseport group container * @sk_bpf_storage: ptr to cache and control for bpf_sk_storage * @sk_rcu: used during RCU grace period * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME) * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME * @sk_txtime_report_errors: set report errors mode for SO_TXTIME * @sk_txtime_unused: unused txtime flags */ struct sock { /* * Now struct inet_timewait_sock also uses sock_common, so please just * don't add nothing before this first member (__sk_common) --acme */ struct sock_common __sk_common; #define sk_node __sk_common.skc_node #define sk_nulls_node __sk_common.skc_nulls_node #define sk_refcnt __sk_common.skc_refcnt #define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping #ifdef CONFIG_XPS #define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping #endif #define sk_dontcopy_begin __sk_common.skc_dontcopy_begin #define sk_dontcopy_end __sk_common.skc_dontcopy_end #define sk_hash __sk_common.skc_hash #define sk_portpair __sk_common.skc_portpair #define sk_num __sk_common.skc_num #define sk_dport __sk_common.skc_dport #define sk_addrpair __sk_common.skc_addrpair #define sk_daddr __sk_common.skc_daddr #define sk_rcv_saddr __sk_common.skc_rcv_saddr #define sk_family __sk_common.skc_family #define sk_state __sk_common.skc_state #define sk_reuse __sk_common.skc_reuse #define sk_reuseport __sk_common.skc_reuseport #define sk_ipv6only __sk_common.skc_ipv6only #define sk_net_refcnt __sk_common.skc_net_refcnt #define sk_bound_dev_if __sk_common.skc_bound_dev_if #define sk_bind_node __sk_common.skc_bind_node #define sk_prot __sk_common.skc_prot #define sk_net __sk_common.skc_net #define sk_v6_daddr __sk_common.skc_v6_daddr #define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr #define sk_cookie __sk_common.skc_cookie #define sk_incoming_cpu __sk_common.skc_incoming_cpu #define sk_flags __sk_common.skc_flags #define sk_rxhash __sk_common.skc_rxhash socket_lock_t sk_lock; atomic_t sk_drops; int sk_rcvlowat; struct sk_buff_head sk_error_queue; struct sk_buff *sk_rx_skb_cache; struct sk_buff_head sk_receive_queue; /* * The backlog queue is special, it is always used with * the per-socket spinlock held and requires low latency * access. Therefore we special case it's implementation. * Note : rmem_alloc is in this structure to fill a hole * on 64bit arches, not because its logically part of * backlog. 
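 *
 * Illustrative flow (a sketch; "limit" is a caller-chosen bound and error
 * handling is trimmed): the softirq input path queues here when the lock
 * owner is a user context, and the owner drains the queue on unlock:
 *
 *	bh_lock_sock(sk);
 *	if (!sock_owned_by_user(sk))
 *		sk_backlog_rcv(sk, skb);		process directly
 *	else if (sk_add_backlog(sk, skb, limit))
 *		... drop, backlog over limit ...
 *	bh_unlock_sock(sk);
 *
 * release_sock() later feeds the queued skbs back through sk_backlog_rcv().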
*/ struct { atomic_t rmem_alloc; int len; struct sk_buff *head; struct sk_buff *tail; } sk_backlog; #define sk_rmem_alloc sk_backlog.rmem_alloc int sk_forward_alloc; #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int sk_ll_usec; /* ===== mostly read cache line ===== */ unsigned int sk_napi_id; #endif int sk_rcvbuf; struct sk_filter __rcu *sk_filter; union { struct socket_wq __rcu *sk_wq; /* private: */ struct socket_wq *sk_wq_raw; /* public: */ }; #ifdef CONFIG_XFRM struct xfrm_policy __rcu *sk_policy[2]; #endif struct dst_entry *sk_rx_dst; struct dst_entry __rcu *sk_dst_cache; atomic_t sk_omem_alloc; int sk_sndbuf; /* ===== cache line for TX ===== */ int sk_wmem_queued; refcount_t sk_wmem_alloc; unsigned long sk_tsq_flags; union { struct sk_buff *sk_send_head; struct rb_root tcp_rtx_queue; }; struct sk_buff *sk_tx_skb_cache; struct sk_buff_head sk_write_queue; __s32 sk_peek_off; int sk_write_pending; __u32 sk_dst_pending_confirm; u32 sk_pacing_status; /* see enum sk_pacing */ long sk_sndtimeo; struct timer_list sk_timer; __u32 sk_priority; __u32 sk_mark; unsigned long sk_pacing_rate; /* bytes per second */ unsigned long sk_max_pacing_rate; struct page_frag sk_frag; netdev_features_t sk_route_caps; netdev_features_t sk_route_nocaps; netdev_features_t sk_route_forced_caps; int sk_gso_type; unsigned int sk_gso_max_size; gfp_t sk_allocation; __u32 sk_txhash; /* * Because of non atomicity rules, all * changes are protected by socket lock. */ u8 sk_padding : 1, sk_kern_sock : 1, sk_no_check_tx : 1, sk_no_check_rx : 1, sk_userlocks : 4; u8 sk_pacing_shift; u16 sk_type; u16 sk_protocol; u16 sk_gso_max_segs; unsigned long sk_lingertime; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, sk_err_soft; u32 sk_ack_backlog; u32 sk_max_ack_backlog; kuid_t sk_uid; spinlock_t sk_peer_lock; struct pid *sk_peer_pid; const struct cred *sk_peer_cred; long sk_rcvtimeo; ktime_t sk_stamp; #if BITS_PER_LONG==32 seqlock_t sk_stamp_seq; #endif u16 sk_tsflags; u8 sk_shutdown; u32 sk_tskey; atomic_t sk_zckey; u8 sk_clockid; u8 sk_txtime_deadline_mode : 1, sk_txtime_report_errors : 1, sk_txtime_unused : 6; struct socket *sk_socket; void *sk_user_data; #ifdef CONFIG_SECURITY void *sk_security; #endif struct sock_cgroup_data sk_cgrp_data; struct mem_cgroup *sk_memcg; void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk); void (*sk_write_space)(struct sock *sk); void (*sk_error_report)(struct sock *sk); int (*sk_backlog_rcv)(struct sock *sk, struct sk_buff *skb); #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sk_buff* (*sk_validate_xmit_skb)(struct sock *sk, struct net_device *dev, struct sk_buff *skb); #endif void (*sk_destruct)(struct sock *sk); struct sock_reuseport __rcu *sk_reuseport_cb; #ifdef CONFIG_BPF_SYSCALL struct bpf_local_storage __rcu *sk_bpf_storage; #endif struct rcu_head sk_rcu; }; enum sk_pacing { SK_PACING_NONE = 0, SK_PACING_NEEDED = 1, SK_PACING_FQ = 2, }; /* Pointer stored in sk_user_data might not be suitable for copying * when cloning the socket. For instance, it can point to a reference * counted object. sk_user_data bottom bit is set if pointer must not * be copied. 
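 *
 * Illustrative use of the accessors defined right below (psock is a
 * hypothetical per-socket object):
 *
 *	rcu_assign_sk_user_data_nocopy(sk, psock);	stores ptr | NOCOPY bit
 *	...
 *	p = rcu_dereference_sk_user_data(sk);		tag bits are masked off
 *	if (sk_user_data_is_nocopy(sk))
 *		... do not carry the pointer over to a clone ...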
*/ #define SK_USER_DATA_NOCOPY 1UL #define SK_USER_DATA_BPF 2UL /* Managed by BPF */ #define SK_USER_DATA_PTRMASK ~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF) /** * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied * @sk: socket */ static inline bool sk_user_data_is_nocopy(const struct sock *sk) { return ((uintptr_t)sk->sk_user_data & SK_USER_DATA_NOCOPY); } #define __sk_user_data(sk) ((*((void __rcu **)&(sk)->sk_user_data))) #define rcu_dereference_sk_user_data(sk) \ ({ \ void *__tmp = rcu_dereference(__sk_user_data((sk))); \ (void *)((uintptr_t)__tmp & SK_USER_DATA_PTRMASK); \ }) #define rcu_assign_sk_user_data(sk, ptr) \ ({ \ uintptr_t __tmp = (uintptr_t)(ptr); \ WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), __tmp); \ }) #define rcu_assign_sk_user_data_nocopy(sk, ptr) \ ({ \ uintptr_t __tmp = (uintptr_t)(ptr); \ WARN_ON_ONCE(__tmp & ~SK_USER_DATA_PTRMASK); \ rcu_assign_pointer(__sk_user_data((sk)), \ __tmp | SK_USER_DATA_NOCOPY); \ }) /* * SK_CAN_REUSE and SK_NO_REUSE on a socket mean that the socket is OK * or not whether his port will be reused by someone else. SK_FORCE_REUSE * on a socket means that the socket will reuse everybody else's port * without looking at the other's sk_reuse value. */ #define SK_NO_REUSE 0 #define SK_CAN_REUSE 1 #define SK_FORCE_REUSE 2 int sk_set_peek_off(struct sock *sk, int val); static inline int sk_peek_offset(struct sock *sk, int flags) { if (unlikely(flags & MSG_PEEK)) { return READ_ONCE(sk->sk_peek_off); } return 0; } static inline void sk_peek_offset_bwd(struct sock *sk, int val) { s32 off = READ_ONCE(sk->sk_peek_off); if (unlikely(off >= 0)) { off = max_t(s32, off - val, 0); WRITE_ONCE(sk->sk_peek_off, off); } } static inline void sk_peek_offset_fwd(struct sock *sk, int val) { sk_peek_offset_bwd(sk, -val); } /* * Hashed lists helper routines */ static inline struct sock *sk_entry(const struct hlist_node *node) { return hlist_entry(node, struct sock, sk_node); } static inline struct sock *__sk_head(const struct hlist_head *head) { return hlist_entry(head->first, struct sock, sk_node); } static inline struct sock *sk_head(const struct hlist_head *head) { return hlist_empty(head) ? NULL : __sk_head(head); } static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); } static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) { return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); } static inline struct sock *sk_next(const struct sock *sk) { return hlist_entry_safe(sk->sk_node.next, struct sock, sk_node); } static inline struct sock *sk_nulls_next(const struct sock *sk) { return (!is_a_nulls(sk->sk_nulls_node.next)) ? hlist_nulls_entry(sk->sk_nulls_node.next, struct sock, sk_nulls_node) : NULL; } static inline bool sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); } static inline bool sk_hashed(const struct sock *sk) { return !sk_unhashed(sk); } static inline void sk_node_init(struct hlist_node *node) { node->pprev = NULL; } static inline void sk_nulls_node_init(struct hlist_nulls_node *node) { node->pprev = NULL; } static inline void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); } /* NB: equivalent to hlist_del_init_rcu */ static inline bool __sk_del_node_init(struct sock *sk) { if (sk_hashed(sk)) { __sk_del_node(sk); sk_node_init(&sk->sk_node); return true; } return false; } /* Grab socket reference count. 
This operation is valid only when sk is ALREADY grabbed f.e. it is found in hash table or a list and the lookup is made under lock preventing hash table modifications. */ static __always_inline void sock_hold(struct sock *sk) { refcount_inc(&sk->sk_refcnt); } /* Ungrab socket in the context, which assumes that socket refcnt cannot hit zero, f.e. it is true in context of any socketcall. */ static __always_inline void __sock_put(struct sock *sk) { refcount_dec(&sk->sk_refcnt); } static inline bool sk_del_node_init(struct sock *sk) { bool rc = __sk_del_node_init(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } #define sk_del_node_init_rcu(sk) sk_del_node_init(sk) static inline bool __sk_nulls_del_node_init_rcu(struct sock *sk) { if (sk_hashed(sk)) { hlist_nulls_del_init_rcu(&sk->sk_nulls_node); return true; } return false; } static inline bool sk_nulls_del_node_init_rcu(struct sock *sk) { bool rc = __sk_nulls_del_node_init_rcu(sk); if (rc) { /* paranoid for a while -acme */ WARN_ON(refcount_read(&sk->sk_refcnt) == 1); __sock_put(sk); } return rc; } static inline void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); } static inline void sk_add_node(struct sock *sk, struct hlist_head *list) { sock_hold(sk); __sk_add_node(sk, list); } static inline void sk_add_node_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport && sk->sk_family == AF_INET6) hlist_add_tail_rcu(&sk->sk_node, list); else hlist_add_head_rcu(&sk->sk_node, list); } static inline void sk_add_node_tail_rcu(struct sock *sk, struct hlist_head *list) { sock_hold(sk); hlist_add_tail_rcu(&sk->sk_node, list); } static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); } static inline void __sk_nulls_add_node_tail_rcu(struct sock *sk, struct hlist_nulls_head *list) { hlist_nulls_add_tail_rcu(&sk->sk_nulls_node, list); } static inline void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) { sock_hold(sk); __sk_nulls_add_node_rcu(sk, list); } static inline void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); } static inline void sk_add_bind_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_bind_node, list); } #define sk_for_each(__sk, list) \ hlist_for_each_entry(__sk, list, sk_node) #define sk_for_each_rcu(__sk, list) \ hlist_for_each_entry_rcu(__sk, list, sk_node) #define sk_nulls_for_each(__sk, node, list) \ hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) #define sk_nulls_for_each_rcu(__sk, node, list) \ hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk) \ hlist_for_each_entry_from(__sk, sk_node) #define sk_nulls_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_safe(__sk, tmp, list) \ hlist_for_each_entry_safe(__sk, tmp, list, sk_node) #define sk_for_each_bound(__sk, list) \ hlist_for_each_entry(__sk, list, sk_bind_node) /** * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset * @tpos: the type * to use as a loop cursor. * @pos: the &struct hlist_node to use as a loop cursor. * @head: the head for your list. * @offset: offset of hlist_node within the struct. 
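 *
 * Illustrative call (a sketch; "head" is a hypothetical hash bucket):
 *
 *	rcu_read_lock();
 *	sk_for_each_entry_offset_rcu(sk, pos, head,
 *				     offsetof(struct sock, sk_node)) {
 *		... every socket linked through sk_node is visited ...
 *	}
 *	rcu_read_unlock();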
* */ #define sk_for_each_entry_offset_rcu(tpos, pos, head, offset) \ for (pos = rcu_dereference(hlist_first_rcu(head)); \ pos != NULL && \ ({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;}); \ pos = rcu_dereference(hlist_next_rcu(pos))) static inline struct user_namespace *sk_user_ns(struct sock *sk) { /* Careful only use this in a context where these parameters * can not change and must all be valid, such as recvmsg from * userspace. */ return sk->sk_socket->file->f_cred->user_ns; } /* Sock flags */ enum sock_flags { SOCK_DEAD, SOCK_DONE, SOCK_URGINLINE, SOCK_KEEPOPEN, SOCK_LINGER, SOCK_DESTROY, SOCK_BROADCAST, SOCK_TIMESTAMP, SOCK_ZAPPED, SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_RX_SOFTWARE, /* %SOF_TIMESTAMPING_RX_SOFTWARE */ SOCK_FASYNC, /* fasync() active */ SOCK_RXQ_OVFL, SOCK_ZEROCOPY, /* buffers from userspace */ SOCK_WIFI_STATUS, /* push wifi status to userspace */ SOCK_NOFCS, /* Tell NIC not to do the Ethernet FCS. * Will use last 4 bytes of packet sent from * user-space instead. */ SOCK_FILTER_LOCKED, /* Filter cannot be changed anymore */ SOCK_SELECT_ERR_QUEUE, /* Wake select on error queue */ SOCK_RCU_FREE, /* wait rcu grace period in sk_destruct() */ SOCK_TXTIME, SOCK_XDP, /* XDP is attached */ SOCK_TSTAMP_NEW, /* Indicates 64 bit timestamps always */ }; #define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE)) static inline void sock_copy_flags(struct sock *nsk, struct sock *osk) { nsk->sk_flags = osk->sk_flags; } static inline void sock_set_flag(struct sock *sk, enum sock_flags flag) { __set_bit(flag, &sk->sk_flags); } static inline void sock_reset_flag(struct sock *sk, enum sock_flags flag) { __clear_bit(flag, &sk->sk_flags); } static inline void sock_valbool_flag(struct sock *sk, enum sock_flags bit, int valbool) { if (valbool) sock_set_flag(sk, bit); else sock_reset_flag(sk, bit); } static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) { return test_bit(flag, &sk->sk_flags); } #ifdef CONFIG_NET DECLARE_STATIC_KEY_FALSE(memalloc_socks_key); static inline int sk_memalloc_socks(void) { return static_branch_unlikely(&memalloc_socks_key); } void __receive_sock(struct file *file); #else static inline int sk_memalloc_socks(void) { return 0; } static inline void __receive_sock(struct file *file) { } #endif static inline gfp_t sk_gfp_mask(const struct sock *sk, gfp_t gfp_mask) { return gfp_mask | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog - 1); } static inline void sk_acceptq_added(struct sock *sk) { WRITE_ONCE(sk->sk_ack_backlog, sk->sk_ack_backlog + 1); } static inline bool sk_acceptq_is_full(const struct sock *sk) { return READ_ONCE(sk->sk_ack_backlog) > READ_ONCE(sk->sk_max_ack_backlog); } /* * Compute minimal free write space needed to queue new packets. 
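 *
 * Worked example (illustrative numbers): with sk_sndbuf = 256K and
 * sk_wmem_queued = 64K, sk_stream_wspace() returns 192K while
 * sk_stream_min_wspace() returns 32K, so the write-space half of the
 * sk_stream_is_writeable() test defined below passes.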
*/ static inline int sk_stream_min_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_wmem_queued) >> 1; } static inline int sk_stream_wspace(const struct sock *sk) { return READ_ONCE(sk->sk_sndbuf) - READ_ONCE(sk->sk_wmem_queued); } static inline void sk_wmem_queued_add(struct sock *sk, int val) { WRITE_ONCE(sk->sk_wmem_queued, sk->sk_wmem_queued + val); } void sk_stream_write_space(struct sock *sk); /* OOB backlog add */ static inline void __sk_add_backlog(struct sock *sk, struct sk_buff *skb) { /* dont let skb dst not refcounted, we are going to leave rcu lock */ skb_dst_force(skb); if (!sk->sk_backlog.tail) WRITE_ONCE(sk->sk_backlog.head, skb); else sk->sk_backlog.tail->next = skb; WRITE_ONCE(sk->sk_backlog.tail, skb); skb->next = NULL; } /* * Take into account size of receive queue and backlog queue * Do not take into account this skb truesize, * to allow even a single big packet to come. */ static inline bool sk_rcvqueues_full(const struct sock *sk, unsigned int limit) { unsigned int qsize = sk->sk_backlog.len + atomic_read(&sk->sk_rmem_alloc); return qsize > limit; } /* The per-socket spinlock must be held here. */ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *skb, unsigned int limit) { if (sk_rcvqueues_full(sk, limit)) return -ENOBUFS; /* * If the skb was allocated from pfmemalloc reserves, only * allow SOCK_MEMALLOC sockets to use it as this socket is * helping free memory */ if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) return -ENOMEM; __sk_add_backlog(sk, skb); sk->sk_backlog.len += skb->truesize; return 0; } int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { if (sk_memalloc_socks() && skb_pfmemalloc(skb)) return __sk_backlog_rcv(sk, skb); return sk->sk_backlog_rcv(sk, skb); } static inline void sk_incoming_cpu_update(struct sock *sk) { int cpu = raw_smp_processor_id(); if (unlikely(READ_ONCE(sk->sk_incoming_cpu) != cpu)) WRITE_ONCE(sk->sk_incoming_cpu, cpu); } static inline void sock_rps_record_flow_hash(__u32 hash) { #ifdef CONFIG_RPS struct rps_sock_flow_table *sock_flow_table; rcu_read_lock(); sock_flow_table = rcu_dereference(rps_sock_flow_table); rps_record_sock_flow(sock_flow_table, hash); rcu_read_unlock(); #endif } static inline void sock_rps_record_flow(const struct sock *sk) { #ifdef CONFIG_RPS if (static_branch_unlikely(&rfs_needed)) { /* Reading sk->sk_rxhash might incur an expensive cache line * miss. * * TCP_ESTABLISHED does cover almost all states where RFS * might be useful, and is cheaper [1] than testing : * IPv4: inet_sk(sk)->inet_daddr * IPv6: ipv6_addr_any(&sk->sk_v6_daddr) * OR an additional socket flag * [1] : sk_state and sk_prot are in the same cache line. 
*/ if (sk->sk_state == TCP_ESTABLISHED) sock_rps_record_flow_hash(sk->sk_rxhash); } #endif } static inline void sock_rps_save_rxhash(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_RPS if (unlikely(sk->sk_rxhash != skb->hash)) sk->sk_rxhash = skb->hash; #endif } static inline void sock_rps_reset_rxhash(struct sock *sk) { #ifdef CONFIG_RPS sk->sk_rxhash = 0; #endif } #define sk_wait_event(__sk, __timeo, __condition, __wait) \ ({ int __rc; \ release_sock(__sk); \ __rc = __condition; \ if (!__rc) { \ *(__timeo) = wait_woken(__wait, \ TASK_INTERRUPTIBLE, \ *(__timeo)); \ } \ sched_annotate_sleep(); \ lock_sock(__sk); \ __rc = __condition; \ __rc; \ }) int sk_stream_wait_connect(struct sock *sk, long *timeo_p); int sk_stream_wait_memory(struct sock *sk, long *timeo_p); void sk_stream_wait_close(struct sock *sk, long timeo_p); int sk_stream_error(struct sock *sk, int flags, int err); void sk_stream_kill_queues(struct sock *sk); void sk_set_memalloc(struct sock *sk); void sk_clear_memalloc(struct sock *sk); void __sk_flush_backlog(struct sock *sk); static inline bool sk_flush_backlog(struct sock *sk) { if (unlikely(READ_ONCE(sk->sk_backlog.tail))) { __sk_flush_backlog(sk); return true; } return false; } int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb); struct request_sock_ops; struct timewait_sock_ops; struct inet_hashinfo; struct raw_hashinfo; struct smc_hashinfo; struct module; /* * caches using SLAB_TYPESAFE_BY_RCU should let .next pointer from nulls nodes * un-modified. Special care is taken when initializing object to zero. */ static inline void sk_prot_clear_nulls(struct sock *sk, int size) { if (offsetof(struct sock, sk_node.next) != 0) memset(sk, 0, offsetof(struct sock, sk_node.next)); memset(&sk->sk_node.pprev, 0, size - offsetof(struct sock, sk_node.pprev)); } /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface */ struct proto { void (*close)(struct sock *sk, long timeout); int (*pre_connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*connect)(struct sock *sk, struct sockaddr *uaddr, int addr_len); int (*disconnect)(struct sock *sk, int flags); struct sock * (*accept)(struct sock *sk, int flags, int *err, bool kern); int (*ioctl)(struct sock *sk, int cmd, unsigned long arg); int (*init)(struct sock *sk); void (*destroy)(struct sock *sk); void (*shutdown)(struct sock *sk, int how); int (*setsockopt)(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen); int (*getsockopt)(struct sock *sk, int level, int optname, char __user *optval, int __user *option); void (*keepalive)(struct sock *sk, int valbool); #ifdef CONFIG_COMPAT int (*compat_ioctl)(struct sock *sk, unsigned int cmd, unsigned long arg); #endif int (*sendmsg)(struct sock *sk, struct msghdr *msg, size_t len); int (*recvmsg)(struct sock *sk, struct msghdr *msg, size_t len, int noblock, int flags, int *addr_len); int (*sendpage)(struct sock *sk, struct page *page, int offset, size_t size, int flags); int (*bind)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*bind_add)(struct sock *sk, struct sockaddr *addr, int addr_len); int (*backlog_rcv) (struct sock *sk, struct sk_buff *skb); void (*release_cb)(struct sock *sk); /* Keeping track of sk's, looking them up, and port selection methods. 
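 *
 * (Illustrative registration sketch; every foo_* name is hypothetical:
 *
 *	static struct proto foo_prot = {
 *		.name		= "FOO",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct foo_sock),
 *		.hash		= foo_hash,
 *		.unhash		= foo_unhash,
 *		.get_port	= foo_get_port,
 *	};
 *
 *  registered once via proto_register(&foo_prot, 1), declared further down.)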
*/ int (*hash)(struct sock *sk); void (*unhash)(struct sock *sk); void (*rehash)(struct sock *sk); int (*get_port)(struct sock *sk, unsigned short snum); /* Keeping track of sockets in use */ #ifdef CONFIG_PROC_FS unsigned int inuse_idx; #endif bool (*stream_memory_free)(const struct sock *sk, int wake); bool (*stream_memory_read)(const struct sock *sk); /* Memory pressure */ void (*enter_memory_pressure)(struct sock *sk); void (*leave_memory_pressure)(struct sock *sk); atomic_long_t *memory_allocated; /* Current allocated memory. */ struct percpu_counter *sockets_allocated; /* Current number of sockets. */ /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ unsigned long *memory_pressure; long *sysctl_mem; int *sysctl_wmem; int *sysctl_rmem; u32 sysctl_wmem_offset; u32 sysctl_rmem_offset; int max_header; bool no_autobind; struct kmem_cache *slab; unsigned int obj_size; slab_flags_t slab_flags; unsigned int useroffset; /* Usercopy region offset */ unsigned int usersize; /* Usercopy region size */ unsigned int __percpu *orphan_count; struct request_sock_ops *rsk_prot; struct timewait_sock_ops *twsk_prot; union { struct inet_hashinfo *hashinfo; struct udp_table *udp_table; struct raw_hashinfo *raw_hash; struct smc_hashinfo *smc_hash; } h; struct module *owner; char name[32]; struct list_head node; #ifdef SOCK_REFCNT_DEBUG atomic_t socks; #endif int (*diag_destroy)(struct sock *sk, int err); } __randomize_layout; int proto_register(struct proto *prot, int alloc_slab); void proto_unregister(struct proto *prot); int sock_load_diag_module(int family, int protocol); #ifdef SOCK_REFCNT_DEBUG static inline void sk_refcnt_debug_inc(struct sock *sk) { atomic_inc(&sk->sk_prot->socks); } static inline void sk_refcnt_debug_dec(struct sock *sk) { atomic_dec(&sk->sk_prot->socks); printk(KERN_DEBUG "%s socket %p released, %d are still alive\n", sk->sk_prot->name, sk, atomic_read(&sk->sk_prot->socks)); } static inline void sk_refcnt_debug_release(const struct sock *sk) { if (refcount_read(&sk->sk_refcnt) != 1) printk(KERN_DEBUG "Destruction of the %s socket %p delayed, refcnt=%d\n", sk->sk_prot->name, sk, refcount_read(&sk->sk_refcnt)); } #else /* SOCK_REFCNT_DEBUG */ #define sk_refcnt_debug_inc(sk) do { } while (0) #define sk_refcnt_debug_dec(sk) do { } while (0) #define sk_refcnt_debug_release(sk) do { } while (0) #endif /* SOCK_REFCNT_DEBUG */ static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) { if (READ_ONCE(sk->sk_wmem_queued) >= READ_ONCE(sk->sk_sndbuf)) return false; return sk->sk_prot->stream_memory_free ? 
sk->sk_prot->stream_memory_free(sk, wake) : true; } static inline bool sk_stream_memory_free(const struct sock *sk) { return __sk_stream_memory_free(sk, 0); } static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) { return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && __sk_stream_memory_free(sk, wake); } static inline bool sk_stream_is_writeable(const struct sock *sk) { return __sk_stream_is_writeable(sk, 0); } static inline int sk_under_cgroup_hierarchy(struct sock *sk, struct cgroup *ancestor) { #ifdef CONFIG_SOCK_CGROUP_DATA return cgroup_is_descendant(sock_cgroup_ptr(&sk->sk_cgrp_data), ancestor); #else return -ENOTSUPP; #endif } static inline bool sk_has_memory_pressure(const struct sock *sk) { return sk->sk_prot->memory_pressure != NULL; } static inline bool sk_under_memory_pressure(const struct sock *sk) { if (!sk->sk_prot->memory_pressure) return false; if (mem_cgroup_sockets_enabled && sk->sk_memcg && mem_cgroup_under_socket_pressure(sk->sk_memcg)) return true; return !!*sk->sk_prot->memory_pressure; } static inline long sk_memory_allocated(const struct sock *sk) { return atomic_long_read(sk->sk_prot->memory_allocated); } static inline long sk_memory_allocated_add(struct sock *sk, int amt) { return atomic_long_add_return(amt, sk->sk_prot->memory_allocated); } static inline void sk_memory_allocated_sub(struct sock *sk, int amt) { atomic_long_sub(amt, sk->sk_prot->memory_allocated); } static inline void sk_sockets_allocated_dec(struct sock *sk) { percpu_counter_dec(sk->sk_prot->sockets_allocated); } static inline void sk_sockets_allocated_inc(struct sock *sk) { percpu_counter_inc(sk->sk_prot->sockets_allocated); } static inline u64 sk_sockets_allocated_read_positive(struct sock *sk) { return percpu_counter_read_positive(sk->sk_prot->sockets_allocated); } static inline int proto_sockets_allocated_sum_positive(struct proto *prot) { return percpu_counter_sum_positive(prot->sockets_allocated); } static inline long proto_memory_allocated(struct proto *prot) { return atomic_long_read(prot->memory_allocated); } static inline bool proto_memory_pressure(struct proto *prot) { if (!prot->memory_pressure) return false; return !!*prot->memory_pressure; } #ifdef CONFIG_PROC_FS /* Called with local bh disabled */ void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc); int sock_prot_inuse_get(struct net *net, struct proto *proto); int sock_inuse_get(struct net *net); #else static inline void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc) { } #endif /* With per-bucket locks this operation is not-atomic, so that * this version is not worse. 
*/ static inline int __sk_prot_rehash(struct sock *sk) { sk->sk_prot->unhash(sk); return sk->sk_prot->hash(sk); } /* About 10 seconds */ #define SOCK_DESTROY_TIME (10*HZ) /* Sockets 0-1023 can't be bound to unless you are superuser */ #define PROT_SOCK 1024 #define SHUTDOWN_MASK 3 #define RCV_SHUTDOWN 1 #define SEND_SHUTDOWN 2 #define SOCK_SNDBUF_LOCK 1 #define SOCK_RCVBUF_LOCK 2 #define SOCK_BINDADDR_LOCK 4 #define SOCK_BINDPORT_LOCK 8 struct socket_alloc { struct socket socket; struct inode vfs_inode; }; static inline struct socket *SOCKET_I(struct inode *inode) { return &container_of(inode, struct socket_alloc, vfs_inode)->socket; } static inline struct inode *SOCK_INODE(struct socket *socket) { return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } /* * Functions for memory accounting */ int __sk_mem_raise_allocated(struct sock *sk, int size, int amt, int kind); int __sk_mem_schedule(struct sock *sk, int size, int kind); void __sk_mem_reduce_allocated(struct sock *sk, int amount); void __sk_mem_reclaim(struct sock *sk, int amount); /* We used to have PAGE_SIZE here, but systems with 64KB pages * do not necessarily have 16x time more memory than 4KB ones. */ #define SK_MEM_QUANTUM 4096 #define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) #define SK_MEM_SEND 0 #define SK_MEM_RECV 1 /* sysctl_mem values are in pages, we convert them in SK_MEM_QUANTUM units */ static inline long sk_prot_mem_limits(const struct sock *sk, int index) { long val = sk->sk_prot->sysctl_mem[index]; #if PAGE_SIZE > SK_MEM_QUANTUM val <<= PAGE_SHIFT - SK_MEM_QUANTUM_SHIFT; #elif PAGE_SIZE < SK_MEM_QUANTUM val >>= SK_MEM_QUANTUM_SHIFT - PAGE_SHIFT; #endif return val; } static inline int sk_mem_pages(int amt) { return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; } static inline bool sk_has_account(struct sock *sk) { /* return true if protocol supports memory accounting */ return !!sk->sk_prot->memory_allocated; } static inline bool sk_wmem_schedule(struct sock *sk, int size) { if (!sk_has_account(sk)) return true; return size <= sk->sk_forward_alloc || __sk_mem_schedule(sk, size, SK_MEM_SEND); } static inline bool sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size) { if (!sk_has_account(sk)) return true; return size <= sk->sk_forward_alloc || __sk_mem_schedule(sk, size, SK_MEM_RECV) || skb_pfmemalloc(skb); } static inline void sk_mem_reclaim(struct sock *sk) { if (!sk_has_account(sk)) return; if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) __sk_mem_reclaim(sk, sk->sk_forward_alloc); } static inline void sk_mem_reclaim_partial(struct sock *sk) { if (!sk_has_account(sk)) return; if (sk->sk_forward_alloc > SK_MEM_QUANTUM) __sk_mem_reclaim(sk, sk->sk_forward_alloc - 1); } static inline void sk_mem_charge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk->sk_forward_alloc -= size; } static inline void sk_mem_uncharge(struct sock *sk, int size) { if (!sk_has_account(sk)) return; sk->sk_forward_alloc += size; /* Avoid a possible overflow. * TCP send queues can make this happen, if sk_mem_reclaim() * is not called and more than 2 GBytes are released at once. * * If we reach 2 MBytes, reclaim 1 MBytes right now, there is * no need to hold that much forward allocation anyway. 
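 *
 * Worked example (illustrative): charging 6000 bytes needs
 * sk_mem_pages(6000) = (6000 + 4095) >> 12 = 2 quanta, i.e. 8K of forward
 * allocation. The thresholds right below are 1 << 21 = 2 MB (trigger) and
 * 1 << 20 = 1 MB (amount reclaimed).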
*/ if (unlikely(sk->sk_forward_alloc >= 1 << 21)) __sk_mem_reclaim(sk, 1 << 20); } DECLARE_STATIC_KEY_FALSE(tcp_tx_skb_cache_key); static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) { sk_wmem_queued_add(sk, -skb->truesize); sk_mem_uncharge(sk, skb->truesize); if (static_branch_unlikely(&tcp_tx_skb_cache_key) && !sk->sk_tx_skb_cache && !skb_cloned(skb)) { skb_ext_reset(skb); skb_zcopy_clear(skb, true); sk->sk_tx_skb_cache = skb; return; } __kfree_skb(skb); } static inline void sock_release_ownership(struct sock *sk) { if (sk->sk_lock.owned) { sk->sk_lock.owned = 0; /* The sk_lock has mutex_unlock() semantics: */ mutex_release(&sk->sk_lock.dep_map, _RET_IP_); } } /* * Macro so as to not evaluate some arguments when * lockdep is not enabled. * * Mark both the sk_lock and the sk_lock.slock as a * per-address-family lock class. */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ sizeof((sk)->sk_lock)); \ lockdep_set_class_and_name(&(sk)->sk_lock.slock, \ (skey), (sname)); \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) #ifdef CONFIG_LOCKDEP static inline bool lockdep_sock_is_held(const struct sock *sk) { return lockdep_is_held(&sk->sk_lock) || lockdep_is_held(&sk->sk_lock.slock); } #endif void lock_sock_nested(struct sock *sk, int subclass); static inline void lock_sock(struct sock *sk) { lock_sock_nested(sk, 0); } void __release_sock(struct sock *sk); void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) #define bh_lock_sock_nested(__sk) \ spin_lock_nested(&((__sk)->sk_lock.slock), \ SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) bool lock_sock_fast(struct sock *sk); /** * unlock_sock_fast - complement of lock_sock_fast * @sk: socket * @slow: slow mode * * fast unlock socket for user context. * If slow mode is on, we call regular release_sock() */ static inline void unlock_sock_fast(struct sock *sk, bool slow) { if (slow) release_sock(sk); else spin_unlock_bh(&sk->sk_lock.slock); } /* Used by processes to "lock" a socket state, so that * interrupts and bottom half handlers won't change it * from under us. It essentially blocks any incoming * packets, so that we won't get any new data or any * packets that change the state of the socket. * * While locked, BH processing will add new packets to * the backlog queue. This queue is processed by the * owner of the socket lock right before it is released. * * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. 
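 *
 * For short, mostly uncontended critical sections the fast-path helpers
 * declared above can be used instead of plain lock_sock() (a sketch):
 *
 *	bool slow = lock_sock_fast(sk);
 *	... touch socket state ...
 *	unlock_sock_fast(sk, slow);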
*/ static inline void sock_owned_by_me(const struct sock *sk) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(!lockdep_sock_is_held(sk) && debug_locks); #endif } static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); return sk->sk_lock.owned; } static inline bool sock_owned_by_user_nocheck(const struct sock *sk) { return sk->sk_lock.owned; } /* no reclassification while locks are held */ static inline bool sock_allow_reclassification(const struct sock *csk) { struct sock *sk = (struct sock *)csk; return !sk->sk_lock.owned && !spin_is_locked(&sk->sk_lock.slock); } struct sock *sk_alloc(struct net *net, int family, gfp_t priority, struct proto *prot, int kern); void sk_free(struct sock *sk); void sk_destruct(struct sock *sk); struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority); void sk_free_unlock_clone(struct sock *sk); struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, gfp_t priority); void __sock_wfree(struct sk_buff *skb); void sock_wfree(struct sk_buff *skb); struct sk_buff *sock_omalloc(struct sock *sk, unsigned long size, gfp_t priority); void skb_orphan_partial(struct sk_buff *skb); void sock_rfree(struct sk_buff *skb); void sock_efree(struct sk_buff *skb); #ifdef CONFIG_INET void sock_edemux(struct sk_buff *skb); void sock_pfree(struct sk_buff *skb); #else #define sock_edemux sock_efree #endif int sock_setsockopt(struct socket *sock, int level, int op, sockptr_t optval, unsigned int optlen); int sock_getsockopt(struct socket *sock, int level, int op, char __user *optval, int __user *optlen); int sock_gettstamp(struct socket *sock, void __user *userstamp, bool timeval, bool time32); struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, int noblock, int *errcode); struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, unsigned long data_len, int noblock, int *errcode, int max_page_order); void *sock_kmalloc(struct sock *sk, int size, gfp_t priority); void sock_kfree_s(struct sock *sk, void *mem, int size); void sock_kzfree_s(struct sock *sk, void *mem, int size); void sk_send_sigurg(struct sock *sk); struct sockcm_cookie { u64 transmit_time; u32 mark; u16 tsflags; }; static inline void sockcm_init(struct sockcm_cookie *sockc, const struct sock *sk) { *sockc = (struct sockcm_cookie) { .tsflags = sk->sk_tsflags }; } int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, struct sockcm_cookie *sockc); int sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct sockcm_cookie *sockc); /* * Functions to fill in entries in struct proto_ops when a protocol * does not implement a particular function. 
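 *
 * (Illustrative sketch; PF_FOO and foo_ops are hypothetical. A family that
 *  lacks, say, listen/accept/mmap support can point those entries at the
 *  stubs declared below:
 *
 *	static const struct proto_ops foo_ops = {
 *		.family		= PF_FOO,
 *		.owner		= THIS_MODULE,
 *		.listen		= sock_no_listen,
 *		.accept		= sock_no_accept,
 *		.socketpair	= sock_no_socketpair,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *	};
 *  )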
*/ int sock_no_bind(struct socket *, struct sockaddr *, int); int sock_no_connect(struct socket *, struct sockaddr *, int, int); int sock_no_socketpair(struct socket *, struct socket *); int sock_no_accept(struct socket *, struct socket *, int, bool); int sock_no_getname(struct socket *, struct sockaddr *, int); int sock_no_ioctl(struct socket *, unsigned int, unsigned long); int sock_no_listen(struct socket *, int); int sock_no_shutdown(struct socket *, int); int sock_no_sendmsg(struct socket *, struct msghdr *, size_t); int sock_no_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t len); int sock_no_recvmsg(struct socket *, struct msghdr *, size_t, int); int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma); ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); ssize_t sock_no_sendpage_locked(struct sock *sk, struct page *page, int offset, size_t size, int flags); /* * Functions to fill in entries in struct proto_ops when a protocol * uses the inet style. */ int sock_common_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen); int sock_common_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int sock_common_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen); void sk_common_release(struct sock *sk); /* * Default socket callbacks and setup code */ /* Initialise core socket variables */ void sock_init_data(struct socket *sock, struct sock *sk); /* * Socket reference counting postulates. * * * Each user of socket SHOULD hold a reference count. * * Each access point to socket (an hash table bucket, reference from a list, * running timer, skb in flight MUST hold a reference count. * * When reference count hits 0, it means it will never increase back. * * When reference count hits 0, it means that no references from * outside exist to this socket and current process on current CPU * is last user and may/should destroy this socket. * * sk_free is called from any context: process, BH, IRQ. When * it is called, socket has no references from outside -> sk_free * may release descendant resources allocated by the socket, but * to the time when it is called, socket is NOT referenced by any * hash tables, lists etc. * * Packets, delivered from outside (from network or from another process) * and enqueued on receive/error queues SHOULD NOT grab reference count, * when they sit in queue. Otherwise, packets will leak to hole, when * socket is looked up by one cpu and unhasing is made by another CPU. * It is true for udp/raw, netlink (leak to receive and error queues), tcp * (leak to backlog). Packet socket does all the processing inside * BR_NETPROTO_LOCK, so that it has not this race condition. UNIX sockets * use separate SMP lock, so that they are prone too. */ /* Ungrab socket and destroy it, if it was the last reference. */ static inline void sock_put(struct sock *sk) { if (refcount_dec_and_test(&sk->sk_refcnt)) sk_free(sk); } /* Generic version of sock_put(), dealing with all sockets * (TCP_TIMEWAIT, TCP_NEW_SYN_RECV, ESTABLISHED...) 
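 *
 * (Illustrative pairing, not specific to one helper: a lookup that returns
 *  a refcounted entry eventually drops that reference again, e.g.
 *
 *	sk = ...lookup with refcounted == true...;
 *	... use sk ...
 *	sock_gen_put(sk);	handles full, timewait and request socks
 *  )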
*/ void sock_gen_put(struct sock *sk); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, unsigned int trim_cap, bool refcounted); static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) { return __sk_receive_skb(sk, skb, nested, 1, true); } static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) { /* sk_tx_queue_mapping accept only upto a 16-bit value */ if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX)) return; sk->sk_tx_queue_mapping = tx_queue; } #define NO_QUEUE_MAPPING USHRT_MAX static inline void sk_tx_queue_clear(struct sock *sk) { sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING; } static inline int sk_tx_queue_get(const struct sock *sk) { if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING) return sk->sk_tx_queue_mapping; return -1; } static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_XPS if (skb_rx_queue_recorded(skb)) { u16 rx_queue = skb_get_rx_queue(skb); if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING)) return; sk->sk_rx_queue_mapping = rx_queue; } #endif } static inline void sk_rx_queue_clear(struct sock *sk) { #ifdef CONFIG_XPS sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING; #endif } #ifdef CONFIG_XPS static inline int sk_rx_queue_get(const struct sock *sk) { if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING) return sk->sk_rx_queue_mapping; return -1; } #endif static inline void sk_set_socket(struct sock *sk, struct socket *sock) { sk->sk_socket = sock; } static inline wait_queue_head_t *sk_sleep(struct sock *sk) { BUILD_BUG_ON(offsetof(struct socket_wq, wait) != 0); return &rcu_dereference_raw(sk->sk_wq)->wait; } /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, * we do not release it in this function, because protocol * probably wants some additional cleanups or even continuing * to work with this socket (TCP). */ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); sk_set_socket(sk, NULL); sk->sk_wq = NULL; write_unlock_bh(&sk->sk_callback_lock); } static inline void sock_graft(struct sock *sk, struct socket *parent) { WARN_ON(parent->sk); write_lock_bh(&sk->sk_callback_lock); rcu_assign_pointer(sk->sk_wq, &parent->wq); parent->sk = sk; sk_set_socket(sk, parent); sk->sk_uid = SOCK_INODE(parent)->i_uid; security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } kuid_t sock_i_uid(struct sock *sk); unsigned long sock_i_ino(struct sock *sk); static inline kuid_t sock_net_uid(const struct net *net, const struct sock *sk) { return sk ? 
sk->sk_uid : make_kuid(net->user_ns, 0); } static inline u32 net_tx_rndhash(void) { u32 v = prandom_u32(); return v ?: 1; } static inline void sk_set_txhash(struct sock *sk) { /* This pairs with READ_ONCE() in skb_set_hash_from_sk() */ WRITE_ONCE(sk->sk_txhash, net_tx_rndhash()); } static inline bool sk_rethink_txhash(struct sock *sk) { if (sk->sk_txhash) { sk_set_txhash(sk); return true; } return false; } static inline struct dst_entry * __sk_dst_get(struct sock *sk) { return rcu_dereference_check(sk->sk_dst_cache, lockdep_sock_is_held(sk)); } static inline struct dst_entry * sk_dst_get(struct sock *sk) { struct dst_entry *dst; rcu_read_lock(); dst = rcu_dereference(sk->sk_dst_cache); if (dst && !atomic_inc_not_zero(&dst->__refcnt)) dst = NULL; rcu_read_unlock(); return dst; } static inline void __dst_negative_advice(struct sock *sk) { struct dst_entry *ndst, *dst = __sk_dst_get(sk); if (dst && dst->ops->negative_advice) { ndst = dst->ops->negative_advice(dst); if (ndst != dst) { rcu_assign_pointer(sk->sk_dst_cache, ndst); sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; } } } static inline void dst_negative_advice(struct sock *sk) { sk_rethink_txhash(sk); __dst_negative_advice(sk); } static inline void __sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; old_dst = rcu_dereference_protected(sk->sk_dst_cache, lockdep_sock_is_held(sk)); rcu_assign_pointer(sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void sk_dst_set(struct sock *sk, struct dst_entry *dst) { struct dst_entry *old_dst; sk_tx_queue_clear(sk); sk->sk_dst_pending_confirm = 0; old_dst = xchg((__force struct dst_entry **)&sk->sk_dst_cache, dst); dst_release(old_dst); } static inline void __sk_dst_reset(struct sock *sk) { __sk_dst_set(sk, NULL); } static inline void sk_dst_reset(struct sock *sk) { sk_dst_set(sk, NULL); } struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie); struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie); static inline void sk_dst_confirm(struct sock *sk) { if (!READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 1); } static inline void sock_confirm_neigh(struct sk_buff *skb, struct neighbour *n) { if (skb_get_dst_pending_confirm(skb)) { struct sock *sk = skb->sk; unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); if (sk && READ_ONCE(sk->sk_dst_pending_confirm)) WRITE_ONCE(sk->sk_dst_pending_confirm, 0); } } bool sk_mc_loop(struct sock *sk); static inline bool sk_can_gso(const struct sock *sk) { return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline void sk_nocaps_add(struct sock *sk, netdev_features_t flags) { sk->sk_route_nocaps |= flags; sk->sk_route_caps &= ~flags; } static inline int skb_do_copy_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, char *to, int copy, int offset) { if (skb->ip_summed == CHECKSUM_NONE) { __wsum csum = 0; if (!csum_and_copy_from_iter_full(to, copy, &csum, from)) return -EFAULT; skb->csum = csum_block_add(skb->csum, csum, offset); } else if (sk->sk_route_caps & NETIF_F_NOCACHE_COPY) { if (!copy_from_iter_full_nocache(to, copy, from)) return -EFAULT; } else if (!copy_from_iter_full(to, copy, from)) return -EFAULT; return 0; } static inline int skb_add_data_nocache(struct sock *sk, struct sk_buff *skb, struct iov_iter *from, int copy) { int err, offset = 
skb->len; err = skb_do_copy_data_nocache(sk, skb, from, skb_put(skb, copy), copy, offset); if (err) __skb_trim(skb, offset); return err; } static inline int skb_copy_to_page_nocache(struct sock *sk, struct iov_iter *from, struct sk_buff *skb, struct page *page, int off, int copy) { int err; err = skb_do_copy_data_nocache(sk, skb, from, page_address(page) + off, copy, skb->len); if (err) return err; skb->len += copy; skb->data_len += copy; skb->truesize += copy; sk_wmem_queued_add(sk, copy); sk_mem_charge(sk, copy); return 0; }
/** * sk_wmem_alloc_get - returns write allocations * @sk: socket * * Return: sk_wmem_alloc minus initial offset of one */ static inline int sk_wmem_alloc_get(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) - 1; } /** * sk_rmem_alloc_get - returns read allocations * @sk: socket * * Return: sk_rmem_alloc */ static inline int sk_rmem_alloc_get(const struct sock *sk) { return atomic_read(&sk->sk_rmem_alloc); } /** * sk_has_allocations - check if allocations are outstanding * @sk: socket * * Return: true if socket has write or read allocations */ static inline bool sk_has_allocations(const struct sock *sk) { return sk_wmem_alloc_get(sk) || sk_rmem_alloc_get(sk); }
/** * skwq_has_sleeper - check if there are any waiting processes * @wq: struct socket_wq * * Return: true if socket_wq has waiting processes * * The purpose of the skwq_has_sleeper and sock_poll_wait is to wrap the memory * barrier call. They were added due to the race found within the tcp code. * * Consider the following tcp code paths:: * * CPU1 CPU2 * sys_select receive packet * ... ... * __add_wait_queue update tp->rcv_nxt * ... ... * tp->rcv_nxt check sock_def_readable * ... { * schedule rcu_read_lock(); * wq = rcu_dereference(sk->sk_wq); * if (wq && waitqueue_active(&wq->wait)) * wake_up_interruptible(&wq->wait) * ... * } * * The race for tcp fires when the __add_wait_queue changes done by CPU1 stay * in its cache, and so does the tp->rcv_nxt update on the CPU2 side. CPU1 * could then end up calling schedule and sleeping forever if no more * data arrives on the socket. * */ static inline bool skwq_has_sleeper(struct socket_wq *wq) { return wq && wq_has_sleeper(&wq->wait); }
/** * sock_poll_wait - place memory barrier behind the poll_wait call. * @filp: file * @sock: socket to wait on * @p: poll_table * * See the comments in the wq_has_sleeper function. */ static inline void sock_poll_wait(struct file *filp, struct socket *sock, poll_table *p) { if (!poll_does_not_wait(p)) { poll_wait(filp, &sock->wq.wait, p); /* We need to be sure we are in sync with the * socket flags modification. * * This memory barrier is paired with the one in wq_has_sleeper. */ smp_mb(); } }
static inline void skb_set_hash_from_sk(struct sk_buff *skb, struct sock *sk) { /* This pairs with WRITE_ONCE() in sk_set_txhash() */ u32 txhash = READ_ONCE(sk->sk_txhash); if (txhash) { skb->l4_hash = 1; skb->hash = txhash; } } void skb_set_owner_w(struct sk_buff *skb, struct sock *sk); /* * Queue a received datagram if it will fit. Stream and sequenced * protocols can't normally use this as they need to fit buffers in * and play with them. * * Inlined as it's very short and called for pretty much every * packet ever received.
*/ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) { skb_orphan(skb); skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); sk_mem_charge(sk, skb->truesize); } static inline __must_check bool skb_set_owner_sk_safe(struct sk_buff *skb, struct sock *sk) { if (sk && refcount_inc_not_zero(&sk->sk_refcnt)) { skb_orphan(skb); skb->destructor = sock_efree; skb->sk = sk; return true; } return false; } void sk_reset_timer(struct sock *sk, struct timer_list *timer, unsigned long expires); void sk_stop_timer(struct sock *sk, struct timer_list *timer); void sk_stop_timer_sync(struct sock *sk, struct timer_list *timer); int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue, struct sk_buff *skb, unsigned int flags, void (*destructor)(struct sock *sk, struct sk_buff *skb)); int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb); int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb); struct sk_buff *sock_dequeue_err_skb(struct sock *sk); /* * Recover an error report and clear atomically */ static inline int sock_error(struct sock *sk) { int err; /* Avoid an atomic operation for the common case. * This is racy since another cpu/thread can change sk_err under us. */ if (likely(data_race(!sk->sk_err))) return 0; err = xchg(&sk->sk_err, 0); return -err; } static inline unsigned long sock_wspace(struct sock *sk) { int amt = 0; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { amt = sk->sk_sndbuf - refcount_read(&sk->sk_wmem_alloc); if (amt < 0) amt = 0; } return amt; } /* Note: * We use sk->sk_wq_raw, from contexts knowing this * pointer is not NULL and cannot disappear/change. */ static inline void sk_set_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; set_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_clear_bit(int nr, struct sock *sk) { if ((nr == SOCKWQ_ASYNC_NOSPACE || nr == SOCKWQ_ASYNC_WAITDATA) && !sock_flag(sk, SOCK_FASYNC)) return; clear_bit(nr, &sk->sk_wq_raw->flags); } static inline void sk_wake_async(const struct sock *sk, int how, int band) { if (sock_flag(sk, SOCK_FASYNC)) { rcu_read_lock(); sock_wake_async(rcu_dereference(sk->sk_wq), how, band); rcu_read_unlock(); } } /* Since sk_{r,w}mem_alloc sums skb->truesize, even a small frame might * need sizeof(sk_buff) + MTU + padding, unless net driver perform copybreak. * Note: for send buffers, TCP works better if we can build two skbs at * minimum. */ #define TCP_SKB_MIN_TRUESIZE (2048 + SKB_DATA_ALIGN(sizeof(struct sk_buff))) #define SOCK_MIN_SNDBUF (TCP_SKB_MIN_TRUESIZE * 2) #define SOCK_MIN_RCVBUF TCP_SKB_MIN_TRUESIZE static inline void sk_stream_moderate_sndbuf(struct sock *sk) { u32 val; if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) return; val = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); WRITE_ONCE(sk->sk_sndbuf, max_t(u32, val, SOCK_MIN_SNDBUF)); } struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp, bool force_schedule); /** * sk_page_frag - return an appropriate page_frag * @sk: socket * * Use the per task page_frag instead of the per socket one for * optimization when we know that we're in process context and own * everything that's associated with %current. 
* * Both direct reclaim and page faults can nest inside other * socket operations and end up recursing into sk_page_frag() * while it's already in use: explicitly avoid task page_frag * usage if the caller is potentially doing any of them. * This assumes that page fault handlers use the GFP_NOFS flags. * * Return: a per task page_frag if context allows that, * otherwise a per socket one. */ static inline struct page_frag *sk_page_frag(struct sock *sk) { if ((sk->sk_allocation & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC | __GFP_FS)) == (__GFP_DIRECT_RECLAIM | __GFP_FS)) return &current->task_frag; return &sk->sk_frag; } bool sk_page_frag_refill(struct sock *sk, struct page_frag *pfrag);
/* * Default write policy as shown to user space via poll/select/SIGIO */ static inline bool sock_writeable(const struct sock *sk) { return refcount_read(&sk->sk_wmem_alloc) < (READ_ONCE(sk->sk_sndbuf) >> 1); } static inline gfp_t gfp_any(void) { return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_rcvtimeo; } static inline long sock_sndtimeo(const struct sock *sk, bool noblock) { return noblock ? 0 : sk->sk_sndtimeo; } static inline int sock_rcvlowat(const struct sock *sk, int waitall, int len) { int v = waitall ? len : min_t(int, READ_ONCE(sk->sk_rcvlowat), len); return v ?: 1; }
/* Alas, with a timeout, socket operations are not restartable. * Compare this to poll(). */ static inline int sock_intr_errno(long timeo) { return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } struct sock_skb_cb { u32 dropcount; }; /* Store sock_skb_cb at the end of skb->cb[] so protocol families * using skb->cb[] can keep using it directly and utilize its * alignment guarantee. */ #define SOCK_SKB_CB_OFFSET ((sizeof_field(struct sk_buff, cb) - \ sizeof(struct sock_skb_cb))) #define SOCK_SKB_CB(__skb) ((struct sock_skb_cb *)((__skb)->cb + \ SOCK_SKB_CB_OFFSET)) #define sock_skb_cb_check_size(size) \ BUILD_BUG_ON((size) > SOCK_SKB_CB_OFFSET) static inline void sock_skb_set_dropcount(const struct sock *sk, struct sk_buff *skb) { SOCK_SKB_CB(skb)->dropcount = sock_flag(sk, SOCK_RXQ_OVFL) ?
atomic_read(&sk->sk_drops) : 0; } static inline void sk_drops_add(struct sock *sk, const struct sk_buff *skb) { int segs = max_t(u16, 1, skb_shinfo(skb)->gso_segs); atomic_add(segs, &sk->sk_drops); } static inline ktime_t sock_read_timestamp(struct sock *sk) { #if BITS_PER_LONG==32 unsigned int seq; ktime_t kt; do { seq = read_seqbegin(&sk->sk_stamp_seq); kt = sk->sk_stamp; } while (read_seqretry(&sk->sk_stamp_seq, seq)); return kt; #else return READ_ONCE(sk->sk_stamp); #endif } static inline void sock_write_timestamp(struct sock *sk, ktime_t kt) { #if BITS_PER_LONG==32 write_seqlock(&sk->sk_stamp_seq); sk->sk_stamp = kt; write_sequnlock(&sk->sk_stamp_seq); #else WRITE_ONCE(sk->sk_stamp, kt); #endif } void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); void __sock_recv_wifi_status(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); static inline void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { ktime_t kt = skb->tstamp; struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); /* * generate control messages if * - receive time stamping in software requested * - software time stamp available and wanted * - hardware time stamps available and wanted */ if (sock_flag(sk, SOCK_RCVTSTAMP) || (sk->sk_tsflags & SOF_TIMESTAMPING_RX_SOFTWARE) || (kt && sk->sk_tsflags & SOF_TIMESTAMPING_SOFTWARE) || (hwtstamps->hwtstamp && (sk->sk_tsflags & SOF_TIMESTAMPING_RAW_HARDWARE))) __sock_recv_timestamp(msg, sk, skb); else sock_write_timestamp(sk, kt); if (sock_flag(sk, SOCK_WIFI_STATUS) && skb->wifi_acked_valid) __sock_recv_wifi_status(msg, sk, skb); } void __sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb); #define SK_DEFAULT_STAMP (-1L * NSEC_PER_SEC) static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { #define FLAGS_TS_OR_DROPS ((1UL << SOCK_RXQ_OVFL) | \ (1UL << SOCK_RCVTSTAMP)) #define TSFLAGS_ANY (SOF_TIMESTAMPING_SOFTWARE | \ SOF_TIMESTAMPING_RAW_HARDWARE) if (sk->sk_flags & FLAGS_TS_OR_DROPS || sk->sk_tsflags & TSFLAGS_ANY) __sock_recv_ts_and_drops(msg, sk, skb); else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) sock_write_timestamp(sk, skb->tstamp); else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) sock_write_timestamp(sk, 0); } void __sock_tx_timestamp(__u16 tsflags, __u8 *tx_flags); /** * _sock_tx_timestamp - checks whether the outgoing packet is to be time stamped * @sk: socket sending this packet * @tsflags: timestamping flags to use * @tx_flags: completed with instructions for time stamping * @tskey: filled in with next sk_tskey (not for TCP, which uses seqno) * * Note: callers should take care of initial ``*tx_flags`` value (usually 0) */ static inline void _sock_tx_timestamp(struct sock *sk, __u16 tsflags, __u8 *tx_flags, __u32 *tskey) { if (unlikely(tsflags)) { __sock_tx_timestamp(tsflags, tx_flags); if (tsflags & SOF_TIMESTAMPING_OPT_ID && tskey && tsflags & SOF_TIMESTAMPING_TX_RECORD_MASK) *tskey = sk->sk_tskey++; } if (unlikely(sock_flag(sk, SOCK_WIFI_STATUS))) *tx_flags |= SKBTX_WIFI_STATUS; } static inline void sock_tx_timestamp(struct sock *sk, __u16 tsflags, __u8 *tx_flags) { _sock_tx_timestamp(sk, tsflags, tx_flags, NULL); } static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags) { _sock_tx_timestamp(skb->sk, tsflags, &skb_shinfo(skb)->tx_flags, &skb_shinfo(skb)->tskey); } DECLARE_STATIC_KEY_FALSE(tcp_rx_skb_cache_key); /** * sk_eat_skb - Release a skb if it is no longer needed * @sk: socket to eat 
this skb from * @skb: socket buffer to eat * * This routine must be called with interrupts disabled or with the socket * locked so that the sk_buff queue operation is ok. */ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb) { __skb_unlink(skb, &sk->sk_receive_queue); if (static_branch_unlikely(&tcp_rx_skb_cache_key) && !sk->sk_rx_skb_cache) { sk->sk_rx_skb_cache = skb; skb_orphan(skb); return; } __kfree_skb(skb); } static inline struct net *sock_net(const struct sock *sk) { return read_pnet(&sk->sk_net); } static inline void sock_net_set(struct sock *sk, struct net *net) { write_pnet(&sk->sk_net, net); } static inline bool skb_sk_is_prefetched(struct sk_buff *skb) { #ifdef CONFIG_INET return skb->destructor == sock_pfree; #else return false; #endif /* CONFIG_INET */ } /* This helper checks if a socket is a full socket, * ie _not_ a timewait or request socket. */ static inline bool sk_fullsock(const struct sock *sk) { return (1 << sk->sk_state) & ~(TCPF_TIME_WAIT | TCPF_NEW_SYN_RECV); } static inline bool sk_is_refcounted(struct sock *sk) { /* Only full sockets have sk->sk_flags. */ return !sk_fullsock(sk) || !sock_flag(sk, SOCK_RCU_FREE); } /** * skb_steal_sock - steal a socket from an sk_buff * @skb: sk_buff to steal the socket from * @refcounted: is set to true if the socket is reference-counted */ static inline struct sock * skb_steal_sock(struct sk_buff *skb, bool *refcounted) { if (skb->sk) { struct sock *sk = skb->sk; *refcounted = true; if (skb_sk_is_prefetched(skb)) *refcounted = sk_is_refcounted(sk); skb->destructor = NULL; skb->sk = NULL; return sk; } *refcounted = false; return NULL; } /* Checks if this SKB belongs to an HW offloaded socket * and whether any SW fallbacks are required based on dev. * Check decrypted mark in case skb_orphan() cleared socket. */ static inline struct sk_buff *sk_validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) { #ifdef CONFIG_SOCK_VALIDATE_XMIT struct sock *sk = skb->sk; if (sk && sk_fullsock(sk) && sk->sk_validate_xmit_skb) { skb = sk->sk_validate_xmit_skb(sk, dev, skb); #ifdef CONFIG_TLS_DEVICE } else if (unlikely(skb->decrypted)) { pr_warn_ratelimited("unencrypted skb with no associated socket - dropping\n"); kfree_skb(skb); skb = NULL; #endif } #endif return skb; } /* This helper checks if a socket is a LISTEN or NEW_SYN_RECV * SYNACK messages can be attached to either ones (depending on SYNCOOKIE) */ static inline bool sk_listener(const struct sock *sk) { return (1 << sk->sk_state) & (TCPF_LISTEN | TCPF_NEW_SYN_RECV); } void sock_enable_timestamp(struct sock *sk, enum sock_flags flag); int sock_recv_errqueue(struct sock *sk, struct msghdr *msg, int len, int level, int type); bool sk_ns_capable(const struct sock *sk, struct user_namespace *user_ns, int cap); bool sk_capable(const struct sock *sk, int cap); bool sk_net_capable(const struct sock *sk, int cap); void sk_get_meminfo(const struct sock *sk, u32 *meminfo); /* Take into consideration the size of the struct sk_buff overhead in the * determination of these values, since that is non-constant across * platforms. This makes socket queueing behavior and performance * not depend upon such differences. 
*/ #define _SK_MEM_PACKETS 256 #define _SK_MEM_OVERHEAD SKB_TRUESIZE(256) #define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) #define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS) extern __u32 sysctl_wmem_max; extern __u32 sysctl_rmem_max; extern int sysctl_tstamp_allow_data; extern int sysctl_optmem_max; extern __u32 sysctl_wmem_default; extern __u32 sysctl_rmem_default; DECLARE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key); static inline int sk_get_wmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_wmem ? */ if (proto->sysctl_wmem_offset) return *(int *)((void *)sock_net(sk) + proto->sysctl_wmem_offset); return *proto->sysctl_wmem; } static inline int sk_get_rmem0(const struct sock *sk, const struct proto *proto) { /* Does this proto have per netns sysctl_rmem ? */ if (proto->sysctl_rmem_offset) return *(int *)((void *)sock_net(sk) + proto->sysctl_rmem_offset); return *proto->sysctl_rmem; } /* Default TCP Small queue budget is ~1 ms of data (1sec >> 10) * Some wifi drivers need to tweak it to get more chunks. * They can use this helper from their ndo_start_xmit() */ static inline void sk_pacing_shift_update(struct sock *sk, int val) { if (!sk || !sk_fullsock(sk) || READ_ONCE(sk->sk_pacing_shift) == val) return; WRITE_ONCE(sk->sk_pacing_shift, val); } /* if a socket is bound to a device, check that the given device * index is either the same or that the socket is bound to an L3 * master device and the given device index is also enslaved to * that L3 master */ static inline bool sk_dev_equal_l3scope(struct sock *sk, int dif) { int mdif; if (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif) return true; mdif = l3mdev_master_ifindex_by_index(sock_net(sk), dif); if (mdif && mdif == sk->sk_bound_dev_if) return true; return false; } void sock_def_readable(struct sock *sk); int sock_bindtoindex(struct sock *sk, int ifindex, bool lock_sk); void sock_enable_timestamps(struct sock *sk); void sock_no_linger(struct sock *sk); void sock_set_keepalive(struct sock *sk); void sock_set_priority(struct sock *sk, u32 priority); void sock_set_rcvbuf(struct sock *sk, int val); void sock_set_mark(struct sock *sk, u32 val); void sock_set_reuseaddr(struct sock *sk); void sock_set_reuseport(struct sock *sk); void sock_set_sndtimeo(struct sock *sk, s64 secs); int sock_bind_add(struct sock *sk, struct sockaddr *addr, int addr_len); #endif /* _SOCK_H */
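/*
 * Illustrative sketch only (not part of this header): a hypothetical
 * protocol receive handler showing how the helpers declared above are
 * typically combined.  my_proto_rcv() is a made-up name; the calls it
 * uses (sock_queue_rcv_skb(), sk_drops_add(), sock_put(), kfree_skb())
 * are the real ones declared in this file or in <linux/skbuff.h>.
 */
static int my_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
	int err;

	/* Queue the skb on sk_receive_queue; sock_queue_rcv_skb() charges
	 * sk_rmem_alloc and fails once sk_rcvbuf would be exceeded. */
	err = sock_queue_rcv_skb(sk, skb);
	if (err < 0) {
		sk_drops_add(sk, skb);
		kfree_skb(skb);
	}

	/* The lookup that found 'sk' took a reference; drop it here, per
	 * the reference counting postulates described earlier in sock.h. */
	sock_put(sk);
	return err;
}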
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _NET_NEIGHBOUR_H #define _NET_NEIGHBOUR_H #include <linux/neighbour.h>
/* * Generic neighbour manipulation * * Authors: * Pedro Roque <roque@di.fc.ul.pt> * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru> * * Changes: * * Harald Welte: <laforge@gnumonks.org> * - Add neighbour cache statistics like rtstat */
#include <linux/atomic.h> #include <linux/refcount.h> #include <linux/netdevice.h> #include <linux/skbuff.h> #include <linux/rcupdate.h> #include <linux/seq_file.h> #include <linux/bitmap.h> #include <linux/err.h> #include <linux/sysctl.h> #include <linux/workqueue.h> #include <net/rtnetlink.h>
/* * NUD stands for "neighbor unreachability detection" */ #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) #define NUD_CONNECTED (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE) struct neighbour;
enum { NEIGH_VAR_MCAST_PROBES, NEIGH_VAR_UCAST_PROBES, NEIGH_VAR_APP_PROBES, NEIGH_VAR_MCAST_REPROBES, NEIGH_VAR_RETRANS_TIME, NEIGH_VAR_BASE_REACHABLE_TIME, NEIGH_VAR_DELAY_PROBE_TIME, NEIGH_VAR_GC_STALETIME, NEIGH_VAR_QUEUE_LEN_BYTES, NEIGH_VAR_PROXY_QLEN, NEIGH_VAR_ANYCAST_DELAY, NEIGH_VAR_PROXY_DELAY, NEIGH_VAR_LOCKTIME, #define NEIGH_VAR_DATA_MAX (NEIGH_VAR_LOCKTIME + 1) /* Following are used as a second
way to access one of the above */ NEIGH_VAR_QUEUE_LEN, /* same data as NEIGH_VAR_QUEUE_LEN_BYTES */ NEIGH_VAR_RETRANS_TIME_MS, /* same data as NEIGH_VAR_RETRANS_TIME */ NEIGH_VAR_BASE_REACHABLE_TIME_MS, /* same data as NEIGH_VAR_BASE_REACHABLE_TIME */ /* Following are used by "default" only */ NEIGH_VAR_GC_INTERVAL, NEIGH_VAR_GC_THRESH1, NEIGH_VAR_GC_THRESH2, NEIGH_VAR_GC_THRESH3, NEIGH_VAR_MAX }; struct neigh_parms { possible_net_t net; struct net_device *dev; struct list_head list; int (*neigh_setup)(struct neighbour *); struct neigh_table *tbl; void *sysctl_table; int dead; refcount_t refcnt; struct rcu_head rcu_head; int reachable_time; int data[NEIGH_VAR_DATA_MAX]; DECLARE_BITMAP(data_state, NEIGH_VAR_DATA_MAX); }; static inline void neigh_var_set(struct neigh_parms *p, int index, int val) { set_bit(index, p->data_state); p->data[index] = val; } #define NEIGH_VAR(p, attr) ((p)->data[NEIGH_VAR_ ## attr]) /* In ndo_neigh_setup, NEIGH_VAR_INIT should be used. * In other cases, NEIGH_VAR_SET should be used. */ #define NEIGH_VAR_INIT(p, attr, val) (NEIGH_VAR(p, attr) = val) #define NEIGH_VAR_SET(p, attr, val) neigh_var_set(p, NEIGH_VAR_ ## attr, val) static inline void neigh_parms_data_state_setall(struct neigh_parms *p) { bitmap_fill(p->data_state, NEIGH_VAR_DATA_MAX); } static inline void neigh_parms_data_state_cleanall(struct neigh_parms *p) { bitmap_zero(p->data_state, NEIGH_VAR_DATA_MAX); } struct neigh_statistics { unsigned long allocs; /* number of allocated neighs */ unsigned long destroys; /* number of destroyed neighs */ unsigned long hash_grows; /* number of hash resizes */ unsigned long res_failed; /* number of failed resolutions */ unsigned long lookups; /* number of lookups */ unsigned long hits; /* number of hits (among lookups) */ unsigned long rcv_probes_mcast; /* number of received mcast ipv6 */ unsigned long rcv_probes_ucast; /* number of received ucast ipv6 */ unsigned long periodic_gc_runs; /* number of periodic GC runs */ unsigned long forced_gc_runs; /* number of forced GC runs */ unsigned long unres_discards; /* number of unresolved drops */ unsigned long table_fulls; /* times even gc couldn't help */ }; #define NEIGH_CACHE_STAT_INC(tbl, field) this_cpu_inc((tbl)->stats->field) struct neighbour { struct neighbour __rcu *next; struct neigh_table *tbl; struct neigh_parms *parms; unsigned long confirmed; unsigned long updated; rwlock_t lock; refcount_t refcnt; unsigned int arp_queue_len_bytes; struct sk_buff_head arp_queue; struct timer_list timer; unsigned long used; atomic_t probes; __u8 flags; __u8 nud_state; __u8 type; __u8 dead; u8 protocol; seqlock_t ha_lock; unsigned char ha[ALIGN(MAX_ADDR_LEN, sizeof(unsigned long))] __aligned(8); struct hh_cache hh; int (*output)(struct neighbour *, struct sk_buff *); const struct neigh_ops *ops; struct list_head gc_list; struct rcu_head rcu; struct net_device *dev; u8 primary_key[0]; } __randomize_layout; struct neigh_ops { int family; void (*solicit)(struct neighbour *, struct sk_buff *); void (*error_report)(struct neighbour *, struct sk_buff *); int (*output)(struct neighbour *, struct sk_buff *); int (*connected_output)(struct neighbour *, struct sk_buff *); }; struct pneigh_entry { struct pneigh_entry *next; possible_net_t net; struct net_device *dev; u8 flags; u8 protocol; u8 key[]; }; /* * neighbour table manipulation */ #define NEIGH_NUM_HASH_RND 4 struct neigh_hash_table { struct neighbour __rcu **hash_buckets; unsigned int hash_shift; __u32 hash_rnd[NEIGH_NUM_HASH_RND]; struct rcu_head rcu; }; struct neigh_table { 
int family; unsigned int entry_size; unsigned int key_len; __be16 protocol; __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd); bool (*key_eq)(const struct neighbour *, const void *pkey); int (*constructor)(struct neighbour *); int (*pconstructor)(struct pneigh_entry *); void (*pdestructor)(struct pneigh_entry *); void (*proxy_redo)(struct sk_buff *skb); int (*is_multicast)(const void *pkey); bool (*allow_add)(const struct net_device *dev, struct netlink_ext_ack *extack); char *id; struct neigh_parms parms; struct list_head parms_list; int gc_interval; int gc_thresh1; int gc_thresh2; int gc_thresh3; unsigned long last_flush; struct delayed_work gc_work; struct timer_list proxy_timer; struct sk_buff_head proxy_queue; atomic_t entries; atomic_t gc_entries; struct list_head gc_list; rwlock_t lock; unsigned long last_rand; struct neigh_statistics __percpu *stats; struct neigh_hash_table __rcu *nht; struct pneigh_entry **phash_buckets; }; enum { NEIGH_ARP_TABLE = 0, NEIGH_ND_TABLE = 1, NEIGH_DN_TABLE = 2, NEIGH_NR_TABLES, NEIGH_LINK_TABLE = NEIGH_NR_TABLES /* Pseudo table for neigh_xmit */ }; static inline int neigh_parms_family(struct neigh_parms *p) { return p->tbl->family; } #define NEIGH_PRIV_ALIGN sizeof(long long) #define NEIGH_ENTRY_SIZE(size) ALIGN((size), NEIGH_PRIV_ALIGN) static inline void *neighbour_priv(const struct neighbour *n) { return (char *)n + n->tbl->entry_size; } /* flags for neigh_update() */ #define NEIGH_UPDATE_F_OVERRIDE 0x00000001 #define NEIGH_UPDATE_F_WEAK_OVERRIDE 0x00000002 #define NEIGH_UPDATE_F_OVERRIDE_ISROUTER 0x00000004 #define NEIGH_UPDATE_F_USE 0x10000000 #define NEIGH_UPDATE_F_EXT_LEARNED 0x20000000 #define NEIGH_UPDATE_F_ISROUTER 0x40000000 #define NEIGH_UPDATE_F_ADMIN 0x80000000 extern const struct nla_policy nda_policy[]; static inline bool neigh_key_eq16(const struct neighbour *n, const void *pkey) { return *(const u16 *)n->primary_key == *(const u16 *)pkey; } static inline bool neigh_key_eq32(const struct neighbour *n, const void *pkey) { return *(const u32 *)n->primary_key == *(const u32 *)pkey; } static inline bool neigh_key_eq128(const struct neighbour *n, const void *pkey) { const u32 *n32 = (const u32 *)n->primary_key; const u32 *p32 = pkey; return ((n32[0] ^ p32[0]) | (n32[1] ^ p32[1]) | (n32[2] ^ p32[2]) | (n32[3] ^ p32[3])) == 0; } static inline struct neighbour *___neigh_lookup_noref( struct neigh_table *tbl, bool (*key_eq)(const struct neighbour *n, const void *pkey), __u32 (*hash)(const void *pkey, const struct net_device *dev, __u32 *hash_rnd), const void *pkey, struct net_device *dev) { struct neigh_hash_table *nht = rcu_dereference_bh(tbl->nht); struct neighbour *n; u32 hash_val; hash_val = hash(pkey, dev, nht->hash_rnd) >> (32 - nht->hash_shift); for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); n != NULL; n = rcu_dereference_bh(n->next)) { if (n->dev == dev && key_eq(n, pkey)) return n; } return NULL; } static inline struct neighbour *__neigh_lookup_noref(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return ___neigh_lookup_noref(tbl, tbl->key_eq, tbl->hash, pkey, dev); } void neigh_table_init(int index, struct neigh_table *tbl); int neigh_table_clear(int index, struct neigh_table *tbl); struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev); struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, const void *pkey); struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev, 
bool want_ref); static inline struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { return __neigh_create(tbl, pkey, dev, true); } void neigh_destroy(struct neighbour *neigh); int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb); int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, u32 flags, u32 nlmsg_pid); void __neigh_set_probe_once(struct neighbour *neigh); bool neigh_remove_one(struct neighbour *ndel, struct neigh_table *tbl); void neigh_changeaddr(struct neigh_table *tbl, struct net_device *dev); int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev); int neigh_carrier_down(struct neigh_table *tbl, struct net_device *dev); int neigh_resolve_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_connected_output(struct neighbour *neigh, struct sk_buff *skb); int neigh_direct_output(struct neighbour *neigh, struct sk_buff *skb); struct neighbour *neigh_event_ns(struct neigh_table *tbl, u8 *lladdr, void *saddr, struct net_device *dev); struct neigh_parms *neigh_parms_alloc(struct net_device *dev, struct neigh_table *tbl); void neigh_parms_release(struct neigh_table *tbl, struct neigh_parms *parms); static inline struct net *neigh_parms_net(const struct neigh_parms *parms) { return read_pnet(&parms->net); } unsigned long neigh_rand_reach_time(unsigned long base); void pneigh_enqueue(struct neigh_table *tbl, struct neigh_parms *p, struct sk_buff *skb); struct pneigh_entry *pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev, int creat); struct pneigh_entry *__pneigh_lookup(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); int pneigh_delete(struct neigh_table *tbl, struct net *net, const void *key, struct net_device *dev); static inline struct net *pneigh_net(const struct pneigh_entry *pneigh) { return read_pnet(&pneigh->net); } void neigh_app_ns(struct neighbour *n); void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie); void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)); int neigh_xmit(int fam, struct net_device *, const void *, struct sk_buff *); void pneigh_for_each(struct neigh_table *tbl, void (*cb)(struct pneigh_entry *)); struct neigh_seq_state { struct seq_net_private p; struct neigh_table *tbl; struct neigh_hash_table *nht; void *(*neigh_sub_iter)(struct neigh_seq_state *state, struct neighbour *n, loff_t *pos); unsigned int bucket; unsigned int flags; #define NEIGH_SEQ_NEIGH_ONLY 0x00000001 #define NEIGH_SEQ_IS_PNEIGH 0x00000002 #define NEIGH_SEQ_SKIP_NOARP 0x00000004 }; void *neigh_seq_start(struct seq_file *, loff_t *, struct neigh_table *, unsigned int); void *neigh_seq_next(struct seq_file *, void *, loff_t *); void neigh_seq_stop(struct seq_file *, void *); int neigh_proc_dointvec(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_proc_dointvec_ms_jiffies(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos); int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p, proc_handler *proc_handler); void neigh_sysctl_unregister(struct neigh_parms *p); static inline void __neigh_parms_put(struct neigh_parms *parms) { refcount_dec(&parms->refcnt); } static inline struct neigh_parms *neigh_parms_clone(struct neigh_parms *parms) { 
refcount_inc(&parms->refcnt); return parms; } /* * Neighbour references */ static inline void neigh_release(struct neighbour *neigh) { if (refcount_dec_and_test(&neigh->refcnt)) neigh_destroy(neigh); } static inline struct neighbour * neigh_clone(struct neighbour *neigh) { if (neigh) refcount_inc(&neigh->refcnt); return neigh; } #define neigh_hold(n) refcount_inc(&(n)->refcnt) static inline int neigh_event_send(struct neighbour *neigh, struct sk_buff *skb) { unsigned long now = jiffies; if (READ_ONCE(neigh->used) != now) WRITE_ONCE(neigh->used, now); if (!(neigh->nud_state&(NUD_CONNECTED|NUD_DELAY|NUD_PROBE))) return __neigh_event_send(neigh, skb); return 0; } #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER) static inline int neigh_hh_bridge(struct hh_cache *hh, struct sk_buff *skb) { unsigned int seq, hh_alen; do { seq = read_seqbegin(&hh->hh_lock); hh_alen = HH_DATA_ALIGN(ETH_HLEN); memcpy(skb->data - hh_alen, hh->hh_data, ETH_ALEN + hh_alen - ETH_HLEN); } while (read_seqretry(&hh->hh_lock, seq)); return 0; } #endif static inline int neigh_hh_output(const struct hh_cache *hh, struct sk_buff *skb) { unsigned int hh_alen = 0; unsigned int seq; unsigned int hh_len; do { seq = read_seqbegin(&hh->hh_lock); hh_len = READ_ONCE(hh->hh_len); if (likely(hh_len <= HH_DATA_MOD)) { hh_alen = HH_DATA_MOD; /* skb_push() would proceed silently if we have room for * the unaligned size but not for the aligned size: * check headroom explicitly. */ if (likely(skb_headroom(skb) >= HH_DATA_MOD)) { /* this is inlined by gcc */ memcpy(skb->data - HH_DATA_MOD, hh->hh_data, HH_DATA_MOD); } } else { hh_alen = HH_DATA_ALIGN(hh_len); if (likely(skb_headroom(skb) >= hh_alen)) { memcpy(skb->data - hh_alen, hh->hh_data, hh_alen); } } } while (read_seqretry(&hh->hh_lock, seq)); if (WARN_ON_ONCE(skb_headroom(skb) < hh_alen)) { kfree_skb(skb); return NET_XMIT_DROP; } __skb_push(skb, hh_len); return dev_queue_xmit(skb); } static inline int neigh_output(struct neighbour *n, struct sk_buff *skb, bool skip_cache) { const struct hh_cache *hh = &n->hh; /* n->nud_state and hh->hh_len could be changed under us. * neigh_hh_output() is taking care of the race later. */ if (!skip_cache && (READ_ONCE(n->nud_state) & NUD_CONNECTED) && READ_ONCE(hh->hh_len)) return neigh_hh_output(hh, skb); return n->output(n, skb); } static inline struct neighbour * __neigh_lookup(struct neigh_table *tbl, const void *pkey, struct net_device *dev, int creat) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n || !creat) return n; n = neigh_create(tbl, pkey, dev); return IS_ERR(n) ? NULL : n; } static inline struct neighbour * __neigh_lookup_errno(struct neigh_table *tbl, const void *pkey, struct net_device *dev) { struct neighbour *n = neigh_lookup(tbl, pkey, dev); if (n) return n; return neigh_create(tbl, pkey, dev); } struct neighbour_cb { unsigned long sched_next; unsigned int flags; }; #define LOCALLY_ENQUEUED 0x1 #define NEIGH_CB(skb) ((struct neighbour_cb *)(skb)->cb) static inline void neigh_ha_snapshot(char *dst, const struct neighbour *n, const struct net_device *dev) { unsigned int seq; do { seq = read_seqbegin(&n->ha_lock); memcpy(dst, n->ha, dev->addr_len); } while (read_seqretry(&n->ha_lock, seq)); } static inline void neigh_update_is_router(struct neighbour *neigh, u32 flags, int *notify) { u8 ndm_flags = 0; ndm_flags |= (flags & NEIGH_UPDATE_F_ISROUTER) ? NTF_ROUTER : 0; if ((neigh->flags ^ ndm_flags) & NTF_ROUTER) { if (ndm_flags & NTF_ROUTER) neigh->flags |= NTF_ROUTER; else neigh->flags &= ~NTF_ROUTER; *notify = 1; } } #endif
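/*
 * Illustrative sketch only (not part of this header): how an L3 protocol
 * might resolve a next hop with the API above.  my_xmit_one() is a made-up
 * name; arp_tbl is the real IPv4 neighbour table (it needs
 * #include <net/arp.h>), and the neighbour calls are the ones declared
 * earlier in this file.
 */
static int my_xmit_one(struct net_device *dev, __be32 next_hop,
		       struct sk_buff *skb)
{
	struct neighbour *n;
	int err;

	/* Find or create the cache entry for this next hop; the returned
	 * neighbour carries a reference that we must drop when done. */
	n = __neigh_lookup_errno(&arp_tbl, &next_hop, dev);
	if (IS_ERR(n)) {
		kfree_skb(skb);
		return PTR_ERR(n);
	}

	/* neigh_output() uses the cached hardware header when the entry is
	 * NUD_CONNECTED and otherwise falls back to n->output(), which may
	 * queue the skb and trigger resolution. */
	err = neigh_output(n, skb, false);

	neigh_release(n);	/* drop the reference taken by the lookup */
	return err;
}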
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_RT_H #define _LINUX_SCHED_RT_H #include <linux/sched.h> struct task_struct;
static inline int rt_prio(int prio) { if (unlikely(prio < MAX_RT_PRIO)) return 1; return 0; } static inline int rt_task(struct task_struct *p) { return rt_prio(p->prio); } static inline bool task_is_realtime(struct task_struct *tsk) { int policy = tsk->policy; if (policy == SCHED_FIFO || policy == SCHED_RR) return true; if (policy == SCHED_DEADLINE) return true; return false; }
#ifdef CONFIG_RT_MUTEXES /* * Must hold either p->pi_lock or task_rq(p)->lock. */ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) { return p->pi_top_task; } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return tsk->pi_blocked_on != NULL; } #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) static inline bool tsk_is_pi_blocked(struct task_struct *tsk) { return false; } #endif
extern void normalize_rt_tasks(void); /* * default timeslice is 100 msecs (used only for SCHED_RR tasks). * Timeslices get refilled after they expire. */ #define RR_TIMESLICE (100 * HZ / 1000) #endif /* _LINUX_SCHED_RT_H */
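/*
 * Illustrative sketch only (not part of this header): the helpers above are
 * typically used for quick "is this task latency sensitive?" checks.
 * my_should_defer_work() is a made-up name for the example.
 */
static inline bool my_should_defer_work(struct task_struct *p)
{
	/* rt_task() looks at the effective priority, so it also covers tasks
	 * temporarily boosted by rt_mutex priority inheritance, while
	 * task_is_realtime() checks the scheduling policy itself and also
	 * treats SCHED_DEADLINE as realtime. */
	return rt_task(p) || task_is_realtime(p);
}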
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _SCSI_SCSI_HOST_H #define _SCSI_SCSI_HOST_H #include
<linux/device.h> #include <linux/list.h> #include <linux/types.h> #include <linux/workqueue.h> #include <linux/mutex.h> #include <linux/seq_file.h> #include <linux/blk-mq.h> #include <scsi/scsi.h> struct block_device; struct completion; struct module; struct scsi_cmnd; struct scsi_device; struct scsi_host_cmd_pool; struct scsi_target; struct Scsi_Host; struct scsi_transport_template; #define SG_ALL SG_CHUNK_SIZE #define MODE_UNKNOWN 0x00 #define MODE_INITIATOR 0x01 #define MODE_TARGET 0x02
struct scsi_host_template { struct module *module; const char *name; /* * The info function will return whatever useful information the * developer sees fit. If not provided, then the name field will * be used instead. * * Status: OPTIONAL */ const char *(* info)(struct Scsi_Host *); /* * Ioctl interface * * Status: OPTIONAL */ int (*ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #ifdef CONFIG_COMPAT /* * Compat handler. Handle 32bit ABI. * When unknown ioctl is passed return -ENOIOCTLCMD. * * Status: OPTIONAL */ int (*compat_ioctl)(struct scsi_device *dev, unsigned int cmd, void __user *arg); #endif int (*init_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd); int (*exit_cmd_priv)(struct Scsi_Host *shost, struct scsi_cmnd *cmd);
/* * The queuecommand function is used to queue up a scsi * command block to the LLDD. When the driver has finished * processing the command, the done callback is invoked. * * If queuecommand returns 0, then the driver has accepted the * command. It must also push it to the HBA if the scsi_cmnd * flag SCMD_LAST is set, or if the driver does not implement * commit_rqs. The done() function must be called on the command * when the driver has finished with it. (you may call done on the * command before queuecommand returns, but in this case you * *must* return 0 from queuecommand). * * Queuecommand may also reject the command, in which case it may * not touch the command and must not call done() for it. * * There are two possible rejection returns: * * SCSI_MLQUEUE_DEVICE_BUSY: Block this device temporarily, but * allow commands to other devices serviced by this host. * * SCSI_MLQUEUE_HOST_BUSY: Block all devices served by this * host temporarily. * * For compatibility, any other non-zero return is treated the * same as SCSI_MLQUEUE_HOST_BUSY. * * NOTE: "temporarily" means either until the next command for * this device/host completes, or a period of time determined by * I/O pressure in the system if there are no other outstanding * commands. * * STATUS: REQUIRED */ int (* queuecommand)(struct Scsi_Host *, struct scsi_cmnd *);
/* * The commit_rqs function is used to trigger a hardware * doorbell after some requests have been queued with * queuecommand, when an error is encountered before sending * the request with SCMD_LAST set. * * STATUS: OPTIONAL */ void (*commit_rqs)(struct Scsi_Host *, u16);
/* * This is an error handling strategy routine. You don't need to * define one of these if you don't want to - there is a default * routine that is present that should work in most cases. For those * driver authors that have the inclination and ability to write their * own strategy routine, this is where it is specified. Note - the * strategy routine is *ALWAYS* run in the context of the kernel eh * thread. Thus you are guaranteed to *NOT* be in an interrupt * handler when you execute this, and you are also guaranteed to * *NOT* have any other commands being queued while you are in the * strategy routine.
When you return from this function, operations * return to normal. * * See scsi_error.c scsi_unjam_host for additional comments about * what this function should and should not be attempting to do. * * Status: REQUIRED (at least one of them) */ int (* eh_abort_handler)(struct scsi_cmnd *); int (* eh_device_reset_handler)(struct scsi_cmnd *); int (* eh_target_reset_handler)(struct scsi_cmnd *); int (* eh_bus_reset_handler)(struct scsi_cmnd *); int (* eh_host_reset_handler)(struct scsi_cmnd *); /* * Before the mid layer attempts to scan for a new device where none * currently exists, it will call this entry in your driver. Should * your driver need to allocate any structs or perform any other init * items in order to send commands to a currently unused target/lun * combo, then this is where you can perform those allocations. This * is specifically so that drivers won't have to perform any kind of * "is this a new device" checks in their queuecommand routine, * thereby making the hot path a bit quicker. * * Return values: 0 on success, non-0 on failure * * Deallocation: If we didn't find any devices at this ID, you will * get an immediate call to slave_destroy(). If we find something * here then you will get a call to slave_configure(), then the * device will be used for however long it is kept around, then when * the device is removed from the system (or * possibly at reboot * time), you will then get a call to slave_destroy(). This is * assuming you implement slave_configure and slave_destroy. * However, if you allocate memory and hang it off the device struct, * then you must implement the slave_destroy() routine at a minimum * in order to avoid leaking memory * each time a device is tore down. * * Status: OPTIONAL */ int (* slave_alloc)(struct scsi_device *); /* * Once the device has responded to an INQUIRY and we know the * device is online, we call into the low level driver with the * struct scsi_device *. If the low level device driver implements * this function, it *must* perform the task of setting the queue * depth on the device. All other tasks are optional and depend * on what the driver supports and various implementation details. * * Things currently recommended to be handled at this time include: * * 1. Setting the device queue depth. Proper setting of this is * described in the comments for scsi_change_queue_depth. * 2. Determining if the device supports the various synchronous * negotiation protocols. The device struct will already have * responded to INQUIRY and the results of the standard items * will have been shoved into the various device flag bits, eg. * device->sdtr will be true if the device supports SDTR messages. * 3. Allocating command structs that the device will need. * 4. Setting the default timeout on this device (if needed). * 5. Anything else the low level driver might want to do on a device * specific setup basis... * 6. Return 0 on success, non-0 on error. The device will be marked * as offline on error so that no access will occur. If you return * non-0, your slave_destroy routine will never get called for this * device, so don't leave any loose memory hanging around, clean * up after yourself before returning non-0 * * Status: OPTIONAL */ int (* slave_configure)(struct scsi_device *); /* * Immediately prior to deallocating the device and after all activity * has ceased the mid layer calls this point so that the low level * driver may completely detach itself from the scsi device and vice * versa. 
The low level driver is responsible for freeing any memory * it allocated in the slave_alloc or slave_configure calls. * * Status: OPTIONAL */ void (* slave_destroy)(struct scsi_device *); /* * Before the mid layer attempts to scan for a new device attached * to a target where no target currently exists, it will call this * entry in your driver. Should your driver need to allocate any * structs or perform any other init items in order to send commands * to a currently unused target, then this is where you can perform * those allocations. * * Return values: 0 on success, non-0 on failure * * Status: OPTIONAL */ int (* target_alloc)(struct scsi_target *); /* * Immediately prior to deallocating the target structure, and * after all activity to attached scsi devices has ceased, the * midlayer calls this point so that the driver may deallocate * and terminate any references to the target. * * Status: OPTIONAL */ void (* target_destroy)(struct scsi_target *); /* * If a host has the ability to discover targets on its own instead * of scanning the entire bus, it can fill in this function and * call scsi_scan_host(). This function will be called periodically * until it returns 1 with the scsi_host and the elapsed time of * the scan in jiffies. * * Status: OPTIONAL */ int (* scan_finished)(struct Scsi_Host *, unsigned long); /* * If the host wants to be called before the scan starts, but * after the midlayer has set up ready for the scan, it can fill * in this function. * * Status: OPTIONAL */ void (* scan_start)(struct Scsi_Host *); /* * Fill in this function to allow the queue depth of this host * to be changeable (on a per device basis). Returns either * the current queue depth setting (may be different from what * was passed in) or an error. An error should only be * returned if the requested depth is legal but the driver was * unable to set it. If the requested depth is illegal, the * driver should set and return the closest legal queue depth. * * Status: OPTIONAL */ int (* change_queue_depth)(struct scsi_device *, int); /* * This functions lets the driver expose the queue mapping * to the block layer. * * Status: OPTIONAL */ int (* map_queues)(struct Scsi_Host *shost); /* * Check if scatterlists need to be padded for DMA draining. * * Status: OPTIONAL */ bool (* dma_need_drain)(struct request *rq); /* * This function determines the BIOS parameters for a given * harddisk. These tend to be numbers that are made up by * the host adapter. Parameters: * size, device, list (heads, sectors, cylinders) * * Status: OPTIONAL */ int (* bios_param)(struct scsi_device *, struct block_device *, sector_t, int []); /* * This function is called when one or more partitions on the * device reach beyond the end of the device. * * Status: OPTIONAL */ void (*unlock_native_capacity)(struct scsi_device *); /* * Can be used to export driver statistics and other infos to the * world outside the kernel ie. userspace and it also provides an * interface to feed the driver with information. * * Status: OBSOLETE */ int (*show_info)(struct seq_file *, struct Scsi_Host *); int (*write_info)(struct Scsi_Host *, char *, int); /* * This is an optional routine that allows the transport to become * involved when a scsi io timer fires. The return value tells the * timer routine how to finish the io timeout handling. * * Status: OPTIONAL */ enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *); /* This is an optional routine that allows transport to initiate * LLD adapter or firmware reset using sysfs attribute. 
* * Return values: 0 on success, -ve value on failure. * * Status: OPTIONAL */ int (*host_reset)(struct Scsi_Host *shost, int reset_type); #define SCSI_ADAPTER_RESET 1 #define SCSI_FIRMWARE_RESET 2 /* * Name of proc directory */ const char *proc_name; /* * Used to store the procfs directory if a driver implements the * show_info method. */ struct proc_dir_entry *proc_dir; /* * This determines if we will use a non-interrupt driven * or an interrupt driven scheme. It is set to the maximum number * of simultaneous commands a single hw queue in HBA will accept. */ int can_queue; /* * In many instances, especially where disconnect / reconnect are * supported, our host also has an ID on the SCSI bus. If this is * the case, then it must be reserved. Please set this_id to -1 if * your setup is in single initiator mode, and the host lacks an * ID. */ int this_id; /* * This determines the degree to which the host adapter is capable * of scatter-gather. */ unsigned short sg_tablesize; unsigned short sg_prot_tablesize; /* * Set this if the host adapter has limitations beside segment count. */ unsigned int max_sectors; /* * Maximum size in bytes of a single segment. */ unsigned int max_segment_size; /* * DMA scatter gather segment boundary limit. A segment crossing this * boundary will be split in two. */ unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * This specifies "machine infinity" for host templates which don't * limit the transfer size. Note this limit represents an absolute * maximum, and may be over the transfer limits allowed for * individual devices (e.g. 256 for SCSI-1). */ #define SCSI_DEFAULT_MAX_SECTORS 1024 /* * True if this host adapter can make good use of linked commands. * This will allow more than one command to be queued to a given * unit on a given host. Set this to the maximum number of command * blocks to be provided for each device. Set this to 1 for one * command block per lun, 2 for two, etc. Do not set this to 0. * You should make sure that the host adapter will do the right thing * before you try setting this above 1. */ short cmd_per_lun; /* * present contains counter indicating how many boards of this * type were found when we did the scan. */ unsigned char present; /* If use block layer to manage tags, this is tag allocation policy */ int tag_alloc_policy; /* * Track QUEUE_FULL events and reduce queue depth on demand. */ unsigned track_queue_depth:1; /* * This specifies the mode that a LLD supports. */ unsigned supported_mode:2; /* * True if this host adapter uses unchecked DMA onto an ISA bus. */ unsigned unchecked_isa_dma:1; /* * True for emulated SCSI host adapters (e.g. ATAPI). */ unsigned emulated:1; /* * True if the low-level driver performs its own reset-settle delays. */ unsigned skip_settle_delay:1; /* True if the controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* * Countdown for host blocking with no commands outstanding. */ unsigned int max_host_blocked; /* * Default value for the blocking. If the queue is empty, * host_blocked counts down in the request_fn until it restarts * host operations as zero is reached. * * FIXME: This should probably be a value in the template */ #define SCSI_DEFAULT_HOST_BLOCKED 7 /* * Pointer to the sysfs class properties for this host, NULL terminated. */ struct device_attribute **shost_attrs; /* * Pointer to the SCSI device properties for this host, NULL terminated. 
*/ struct device_attribute **sdev_attrs; /* * Pointer to the SCSI device attribute groups for this host, * NULL terminated. */ const struct attribute_group **sdev_groups; /* * Vendor Identifier associated with the host * * Note: When specifying vendor_id, be sure to read the * Vendor Type and ID formatting requirements specified in * scsi_netlink.h */ u64 vendor_id; /* * Additional per-command data allocated for the driver. */ unsigned int cmd_size; struct scsi_host_cmd_pool *cmd_pool; /* Delay for runtime autosuspend */ int rpm_autosuspend_delay; }; /* * Temporary #define for host lock push down. Can be removed when all * drivers have been updated to take advantage of unlocked * queuecommand. * */ #define DEF_SCSI_QCMD(func_name) \ int func_name(struct Scsi_Host *shost, struct scsi_cmnd *cmd) \ { \ unsigned long irq_flags; \ int rc; \ spin_lock_irqsave(shost->host_lock, irq_flags); \ rc = func_name##_lck (cmd, cmd->scsi_done); \ spin_unlock_irqrestore(shost->host_lock, irq_flags); \ return rc; \ } /* * shost state: If you alter this, you also need to alter scsi_sysfs.c * (for the ascii descriptions) and the state model enforcer: * scsi_host_set_state() */ enum scsi_host_state { SHOST_CREATED = 1, SHOST_RUNNING, SHOST_CANCEL, SHOST_DEL, SHOST_RECOVERY, SHOST_CANCEL_RECOVERY, SHOST_DEL_RECOVERY, }; struct Scsi_Host { /* * __devices is protected by the host_lock, but you should * usually use scsi_device_lookup / shost_for_each_device * to access it and don't care about locking yourself. * In the rare case of being in irq context you can use * their __ prefixed variants with the lock held. NEVER * access this list directly from a driver. */ struct list_head __devices; struct list_head __targets; struct list_head starved_list; spinlock_t default_lock; spinlock_t *host_lock; struct mutex scan_mutex;/* serialize scanning activity */ struct list_head eh_cmd_q; struct task_struct * ehandler; /* Error recovery thread. */ struct completion * eh_action; /* Wait for specific actions on the host. */ wait_queue_head_t host_wait; struct scsi_host_template *hostt; struct scsi_transport_template *transportt; /* Area to keep a shared tag map */ struct blk_mq_tag_set tag_set; atomic_t host_blocked; unsigned int host_failed; /* commands that failed. protected by host_lock */ unsigned int host_eh_scheduled; /* EH scheduled without command */ unsigned int host_no; /* Used for IOCTL_GET_IDLUN, /proc/scsi et al. */ /* next two fields are used to bound the time spent in error handling */ int eh_deadline; unsigned long last_reset; /* * These three parameters can be used to allow for wide scsi, * and for host adapters that support multiple busses * The last two should be set to 1 more than the actual max id * or lun (e.g. 8 for SCSI parallel systems). */ unsigned int max_channel; unsigned int max_id; u64 max_lun; /* * This is a unique identifier that must be assigned so that we * have some way of identifying each detected host adapter properly * and uniquely. For hosts that do not support more than one card * in the system at one time, this does not need to be set. It is * initialized to 0 in scsi_register. */ unsigned int unique_id; /* * The maximum length of SCSI commands that this host can accept. * Probably 12 for most host adapters, but could be 16 for others. * or 260 if the driver supports variable length cdbs. * For drivers that don't set this field, a value of 12 is * assumed. 
*/ unsigned short max_cmd_len; int this_id; int can_queue; short cmd_per_lun; short unsigned int sg_tablesize; short unsigned int sg_prot_tablesize; unsigned int max_sectors; unsigned int max_segment_size; unsigned long dma_boundary; unsigned long virt_boundary_mask; /* * In scsi-mq mode, the number of hardware queues supported by the LLD. * * Note: it is assumed that each hardware queue has a queue depth of * can_queue. In other words, the total queue depth per host * is nr_hw_queues * can_queue. However, for when host_tagset is set, * the total queue depth is can_queue. */ unsigned nr_hw_queues; unsigned active_mode:2; unsigned unchecked_isa_dma:1; /* * Host has requested that no further requests come through for the * time being. */ unsigned host_self_blocked:1; /* * Host uses correct SCSI ordering not PC ordering. The bit is * set for the minority of drivers whose authors actually read * the spec ;). */ unsigned reverse_ordering:1; /* Task mgmt function in progress */ unsigned tmf_in_progress:1; /* Asynchronous scan in progress */ unsigned async_scan:1; /* Don't resume host in EH */ unsigned eh_noresume:1; /* The controller does not support WRITE SAME */ unsigned no_write_same:1; /* True if the host uses host-wide tagspace */ unsigned host_tagset:1; /* Host responded with short (<36 bytes) INQUIRY result */ unsigned short_inquiry:1; /* The transport requires the LUN bits NOT to be stored in CDB[1] */ unsigned no_scsi2_lun_in_cdb:1; /* * Optional work queue to be utilized by the transport */ char work_q_name[20]; struct workqueue_struct *work_q; /* * Task management function work queue */ struct workqueue_struct *tmf_work_q; /* * Value host_blocked counts down from */ unsigned int max_host_blocked; /* Protection Information */ unsigned int prot_capabilities; unsigned char prot_guard_type; /* legacy crap */ unsigned long base; unsigned long io_port; unsigned char n_io_port; unsigned char dma_channel; unsigned int irq; enum scsi_host_state shost_state; /* ldm bits */ struct device shost_gendev, shost_dev; /* * Points to the transport data (if any) which is allocated * separately */ void *shost_data; /* * Points to the physical bus device we'd use to do DMA * Needed just in case we have virtual hosts. */ struct device *dma_dev; /* * We should ensure that this is aligned, both for better performance * and also because some compilers (m68k) don't automatically force * alignment to a long boundary. */ unsigned long hostdata[] /* Used for storage of host specific stuff */ __attribute__ ((aligned (sizeof(unsigned long)))); }; #define class_to_shost(d) \ container_of(d, struct Scsi_Host, shost_dev) #define shost_printk(prefix, shost, fmt, a...) 
\ dev_printk(prefix, &(shost)->shost_gendev, fmt, ##a) static inline void *shost_priv(struct Scsi_Host *shost) { return (void *)shost->hostdata; } int scsi_is_host_device(const struct device *); static inline struct Scsi_Host *dev_to_shost(struct device *dev) { while (!scsi_is_host_device(dev)) { if (!dev->parent) return NULL; dev = dev->parent; } return container_of(dev, struct Scsi_Host, shost_gendev); } static inline int scsi_host_in_recovery(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RECOVERY || shost->shost_state == SHOST_CANCEL_RECOVERY || shost->shost_state == SHOST_DEL_RECOVERY || shost->tmf_in_progress; } extern int scsi_queue_work(struct Scsi_Host *, struct work_struct *); extern void scsi_flush_work(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *, int); extern int __must_check scsi_add_host_with_dma(struct Scsi_Host *, struct device *, struct device *); extern void scsi_scan_host(struct Scsi_Host *); extern void scsi_rescan_device(struct device *); extern void scsi_remove_host(struct Scsi_Host *); extern struct Scsi_Host *scsi_host_get(struct Scsi_Host *); extern int scsi_host_busy(struct Scsi_Host *shost); extern void scsi_host_put(struct Scsi_Host *t); extern struct Scsi_Host *scsi_host_lookup(unsigned short); extern const char *scsi_host_state_name(enum scsi_host_state); extern void scsi_host_complete_all_commands(struct Scsi_Host *shost, int status); static inline int __must_check scsi_add_host(struct Scsi_Host *host, struct device *dev) { return scsi_add_host_with_dma(host, dev, dev); } static inline struct device *scsi_get_device(struct Scsi_Host *shost) { return shost->shost_gendev.parent; } /** * scsi_host_scan_allowed - Is scanning of this host allowed * @shost: Pointer to Scsi_Host. **/ static inline int scsi_host_scan_allowed(struct Scsi_Host *shost) { return shost->shost_state == SHOST_RUNNING || shost->shost_state == SHOST_RECOVERY; } extern void scsi_unblock_requests(struct Scsi_Host *); extern void scsi_block_requests(struct Scsi_Host *); extern int scsi_host_block(struct Scsi_Host *shost); extern int scsi_host_unblock(struct Scsi_Host *shost, int new_state); void scsi_host_busy_iter(struct Scsi_Host *, bool (*fn)(struct scsi_cmnd *, void *, bool), void *priv); struct class_container; /* * These two functions are used to allocate and free a pseudo device * which will connect to the host adapter itself rather than any * physical device. You must deallocate when you are done with the * thing. This physical pseudo-device isn't real and won't be available * from any high-level drivers. */ extern void scsi_free_host_dev(struct scsi_device *); extern struct scsi_device *scsi_get_host_dev(struct Scsi_Host *); /* * DIF defines the exchange of protection information between * initiator and SBC block device. * * DIX defines the exchange of protection information between OS and * initiator. */ enum scsi_host_prot_capabilities { SHOST_DIF_TYPE1_PROTECTION = 1 << 0, /* T10 DIF Type 1 */ SHOST_DIF_TYPE2_PROTECTION = 1 << 1, /* T10 DIF Type 2 */ SHOST_DIF_TYPE3_PROTECTION = 1 << 2, /* T10 DIF Type 3 */ SHOST_DIX_TYPE0_PROTECTION = 1 << 3, /* DIX between OS and HBA only */ SHOST_DIX_TYPE1_PROTECTION = 1 << 4, /* DIX with DIF Type 1 */ SHOST_DIX_TYPE2_PROTECTION = 1 << 5, /* DIX with DIF Type 2 */ SHOST_DIX_TYPE3_PROTECTION = 1 << 6, /* DIX with DIF Type 3 */ }; /* * SCSI hosts which support the Data Integrity Extensions must * indicate their capabilities by setting the prot_capabilities using * this call. 
*/ static inline void scsi_host_set_prot(struct Scsi_Host *shost, unsigned int mask) { shost->prot_capabilities = mask; } static inline unsigned int scsi_host_get_prot(struct Scsi_Host *shost) { return shost->prot_capabilities; } static inline int scsi_host_prot_dma(struct Scsi_Host *shost) { return shost->prot_capabilities >= SHOST_DIX_TYPE0_PROTECTION; } static inline unsigned int scsi_host_dif_capable(struct Scsi_Host *shost, unsigned int target_type) { static unsigned char cap[] = { 0, SHOST_DIF_TYPE1_PROTECTION, SHOST_DIF_TYPE2_PROTECTION, SHOST_DIF_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type] ? target_type : 0; } static inline unsigned int scsi_host_dix_capable(struct Scsi_Host *shost, unsigned int target_type) { #if defined(CONFIG_BLK_DEV_INTEGRITY) static unsigned char cap[] = { SHOST_DIX_TYPE0_PROTECTION, SHOST_DIX_TYPE1_PROTECTION, SHOST_DIX_TYPE2_PROTECTION, SHOST_DIX_TYPE3_PROTECTION }; if (target_type >= ARRAY_SIZE(cap)) return 0; return shost->prot_capabilities & cap[target_type]; #endif return 0; } /* * All DIX-capable initiators must support the T10-mandated CRC * checksum. Controllers can optionally implement the IP checksum * scheme which has much lower impact on system performance. Note * that the main rationale for the checksum is to match integrity * metadata with data. Detecting bit errors are a job for ECC memory * and buses. */ enum scsi_host_guard_type { SHOST_DIX_GUARD_CRC = 1 << 0, SHOST_DIX_GUARD_IP = 1 << 1, }; static inline void scsi_host_set_guard(struct Scsi_Host *shost, unsigned char type) { shost->prot_guard_type = type; } static inline unsigned char scsi_host_get_guard(struct Scsi_Host *shost) { return shost->prot_guard_type; } extern int scsi_host_set_state(struct Scsi_Host *, enum scsi_host_state); #endif /* _SCSI_SCSI_HOST_H */
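/*
 * Illustrative sketch (not part of the header above): how a hypothetical
 * low-level driver might advertise T10 DIF/DIX support using the helpers
 * declared in scsi_host.h. The function name my_adapter_setup_prot() and
 * the particular capability mask are assumptions chosen for illustration
 * only, not a statement about any real driver.
 */
static void my_adapter_setup_prot(struct Scsi_Host *shost)
{
	/* Claim DIF Type 1 target protection plus DIX between OS and HBA. */
	scsi_host_set_prot(shost, SHOST_DIF_TYPE1_PROTECTION |
				  SHOST_DIX_TYPE0_PROTECTION |
				  SHOST_DIX_TYPE1_PROTECTION);

	/* Prefer the cheaper IP checksum guard where the controller allows it. */
	scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP);

	/*
	 * With this mask, scsi_host_dif_capable(shost, 1) returns non-zero
	 * and scsi_host_prot_dma(shost) is true, because a DIX type is set.
	 */
}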
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for the TCP protocol. * * Version: @(#)tcp.h 1.0.2 04/28/93 * * Author: Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG> */ #ifndef _LINUX_TCP_H #define _LINUX_TCP_H #include <linux/skbuff.h> #include <linux/win_minmax.h> #include <net/sock.h> #include <net/inet_connection_sock.h> #include <net/inet_timewait_sock.h> #include <uapi/linux/tcp.h> static inline struct tcphdr *tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_transport_header(skb); } static inline unsigned int __tcp_hdrlen(const struct tcphdr *th) { return th->doff * 4; } static inline unsigned int tcp_hdrlen(const struct sk_buff *skb) { return __tcp_hdrlen(tcp_hdr(skb)); } static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb) { return (struct tcphdr *)skb_inner_transport_header(skb); } static inline unsigned int inner_tcp_hdrlen(const struct sk_buff *skb) { return inner_tcp_hdr(skb)->doff * 4; } static inline unsigned int tcp_optlen(const struct sk_buff *skb) { return (tcp_hdr(skb)->doff - 5) * 4; } /* TCP Fast Open */ #define TCP_FASTOPEN_COOKIE_MIN 4 /* Min Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_MAX 16 /* Max Fast Open Cookie size in bytes */ #define TCP_FASTOPEN_COOKIE_SIZE 8 /* the size employed by this impl.
*/ /* TCP Fast Open Cookie as stored in memory */ struct tcp_fastopen_cookie { __le64 val[DIV_ROUND_UP(TCP_FASTOPEN_COOKIE_MAX, sizeof(u64))]; s8 len; bool exp; /* In RFC6994 experimental option format */ }; /* This defines a selective acknowledgement block. */ struct tcp_sack_block_wire { __be32 start_seq; __be32 end_seq; }; struct tcp_sack_block { u32 start_seq; u32 end_seq; }; /*These are used to set the sack_ok field in struct tcp_options_received */ #define TCP_SACK_SEEN (1 << 0) /*1 = peer is SACK capable, */ #define TCP_DSACK_SEEN (1 << 2) /*1 = DSACK was received from peer*/ struct tcp_options_received { /* PAWS/RTTM data */ int ts_recent_stamp;/* Time we stored ts_recent (for aging) */ u32 ts_recent; /* Time stamp to echo next */ u32 rcv_tsval; /* Time stamp value */ u32 rcv_tsecr; /* Time stamp echo reply */ u16 saw_tstamp : 1, /* Saw TIMESTAMP on last packet */ tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ dsack : 1, /* D-SACK is scheduled */ wscale_ok : 1, /* Wscale seen on SYN packet */ sack_ok : 3, /* SACK seen on SYN packet */ smc_ok : 1, /* SMC seen on SYN packet */ snd_wscale : 4, /* Window scaling received from sender */ rcv_wscale : 4; /* Window scaling to send to receiver */ u8 saw_unknown:1, /* Received unknown option */ unused:7; u8 num_sacks; /* Number of SACK blocks */ u16 user_mss; /* mss requested by user in ioctl */ u16 mss_clamp; /* Maximal mss, negotiated at connection setup */ }; static inline void tcp_clear_options(struct tcp_options_received *rx_opt) { rx_opt->tstamp_ok = rx_opt->sack_ok = 0; rx_opt->wscale_ok = rx_opt->snd_wscale = 0; #if IS_ENABLED(CONFIG_SMC) rx_opt->smc_ok = 0; #endif } /* This is the max number of SACKS that we'll generate and process. It's safe * to increase this, although since: * size = TCPOLEN_SACK_BASE_ALIGNED (4) + n * TCPOLEN_SACK_PERBLOCK (8) * only four options will fit in a standard TCP header */ #define TCP_NUM_SACKS 4 struct tcp_request_sock_ops; struct tcp_request_sock { struct inet_request_sock req; const struct tcp_request_sock_ops *af_specific; u64 snt_synack; /* first SYNACK sent time */ bool tfo_listener; bool is_mptcp; #if IS_ENABLED(CONFIG_MPTCP) bool drop_req; #endif u32 txhash; u32 rcv_isn; u32 snt_isn; u32 ts_off; u32 last_oow_ack_time; /* last SYNACK */ u32 rcv_nxt; /* the ack # by SYNACK. For * FastOpen it's the seq# * after data-in-SYN. */ u8 syn_tos; }; static inline struct tcp_request_sock *tcp_rsk(const struct request_sock *req) { return (struct tcp_request_sock *)req; } struct tcp_sock { /* inet_connection_sock has to be the first member of tcp_sock */ struct inet_connection_sock inet_conn; u16 tcp_header_len; /* Bytes of tcp header to send */ u16 gso_segs; /* Max number of segs per GSO packet */ /* * Header prediction flags * 0x5?10 << 16 + snd_wnd in net byte order */ __be32 pred_flags; /* * RFC793 variables by their proper names. This means you can * read the code and the spec side by side (and laugh ...) * See RFC793 and RFC1122. The RFC writes these in capitals. */ u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived * sum(delta(rcv_nxt)), or how many bytes * were acked. */ u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn * total number of segments in. */ u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn * total number of data segments in. 
*/ u32 rcv_nxt; /* What we want to receive next */ u32 copied_seq; /* Head of yet unread data */ u32 rcv_wup; /* rcv_nxt on last window update sent */ u32 snd_nxt; /* Next sequence we send */ u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut * The total number of segments sent. */ u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut * total number of data segments sent. */ u64 bytes_sent; /* RFC4898 tcpEStatsPerfHCDataOctetsOut * total number of data bytes sent. */ u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked * sum(delta(snd_una)), or how many bytes * were acked. */ u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups * total number of DSACK blocks received */ u32 snd_una; /* First byte we want an ack for */ u32 snd_sml; /* Last byte of the most recently transmitted small packet */ u32 rcv_tstamp; /* timestamp of last received ACK (for keepalives) */ u32 lsndtime; /* timestamp of last sent data packet (for restart window) */ u32 last_oow_ack_time; /* timestamp of last out-of-window ACK */ u32 compressed_ack_rcv_nxt; u32 tsoffset; /* timestamp offset */ struct list_head tsq_node; /* anchor in tsq_tasklet.head list */ struct list_head tsorted_sent_queue; /* time-sorted sent but un-SACKed skbs */ u32 snd_wl1; /* Sequence for window update */ u32 snd_wnd; /* The window we expect to receive */ u32 max_window; /* Maximal window ever seen from peer */ u32 mss_cache; /* Cached effective mss, not including SACKS */ u32 window_clamp; /* Maximal window to advertise */ u32 rcv_ssthresh; /* Current window clamp */ /* Information of the most recently (s)acked skb */ struct tcp_rack { u64 mstamp; /* (Re)sent time of the skb */ u32 rtt_us; /* Associated RTT */ u32 end_seq; /* Ending TCP sequence of the skb */ u32 last_delivered; /* tp->delivered at last reo_wnd adj */ u8 reo_wnd_steps; /* Allowed reordering window */ #define TCP_RACK_RECOVERY_THRESH 16 u8 reo_wnd_persist:5, /* No. of recovery since last adj */ dsack_seen:1, /* Whether DSACK seen after last adj */ advanced:1; /* mstamp advanced since last lost marking */ } rack; u16 advmss; /* Advertised MSS */ u8 compressed_ack; u8 dup_ack_counter:2, tlp_retrans:1, /* TLP is a retransmission */ unused:5; u32 chrono_start; /* Start time in jiffies of a TCP chrono */ u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ u8 chrono_type:2, /* current chronograph type */ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ fastopen_client_fail:2; /* reason why fastopen failed */ u8 nonagle : 4,/* Disable Nagle algorithm? */ thin_lto : 1,/* Use linear timeouts for thin streams */ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ repair : 1, frto : 1;/* F-RTO (RFC5682) activated in CA_Loss */ u8 repair_queue; u8 save_syn:2, /* Save headers of SYN packet */ syn_data:1, /* SYN includes data */ syn_fastopen:1, /* SYN includes Fast Open option */ syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */ syn_fastopen_ch:1, /* Active TFO re-enabling probe */ syn_data_acked:1,/* data in SYN is acked by SYN-ACK */ is_cwnd_limited:1;/* forward progress limited by snd_cwnd? 
*/ u32 tlp_high_seq; /* snd_nxt at the time of TLP */ u32 tcp_tx_delay; /* delay (in usec) added to TX packets */ u64 tcp_wstamp_ns; /* departure time for next sent data packet */ u64 tcp_clock_cache; /* cache last tcp_clock_ns() (see tcp_mstamp_refresh()) */ /* RTT measurement */ u64 tcp_mstamp; /* most recent packet received/sent */ u32 srtt_us; /* smoothed round trip time << 3 in usecs */ u32 mdev_us; /* medium deviation */ u32 mdev_max_us; /* maximal mdev for the last rtt period */ u32 rttvar_us; /* smoothed mdev_max */ u32 rtt_seq; /* sequence number to update rttvar */ struct minmax rtt_min; u32 packets_out; /* Packets which are "in flight" */ u32 retrans_out; /* Retransmitted packets out */ u32 max_packets_out; /* max packets_out in last window */ u32 max_packets_seq; /* right edge of max_packets_out flight */ u16 urg_data; /* Saved octet of OOB data and control flags */ u8 ecn_flags; /* ECN status bits. */ u8 keepalive_probes; /* num of allowed keep alive probes */ u32 reordering; /* Packet reordering metric. */ u32 reord_seen; /* number of data packet reordering events */ u32 snd_up; /* Urgent pointer */ /* * Options received (usually on last packet, some only on SYN packets). */ struct tcp_options_received rx_opt; /* * Slow start and congestion control (see also Nagle, and Karn & Partridge) */ u32 snd_ssthresh; /* Slow start size threshold */ u32 snd_cwnd; /* Sending congestion window */ u32 snd_cwnd_cnt; /* Linear increase counter */ u32 snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */ u32 snd_cwnd_used; u32 snd_cwnd_stamp; u32 prior_cwnd; /* cwnd right before starting loss recovery */ u32 prr_delivered; /* Number of newly delivered packets to * receiver in Recovery. */ u32 prr_out; /* Total number of pkts sent during Recovery. */ u32 delivered; /* Total data packets delivered incl. rexmits */ u32 delivered_ce; /* Like the above but only ECE marked packets */ u32 lost; /* Total data packets lost incl. rexmits */ u32 app_limited; /* limited until "delivered" reaches this val */ u64 first_tx_mstamp; /* start of window send phase */ u64 delivered_mstamp; /* time we reached "delivered" */ u32 rate_delivered; /* saved rate sample: packets delivered */ u32 rate_interval_us; /* saved rate sample: time elapsed */ u32 rcv_wnd; /* Current receiver window */ u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ u32 notsent_lowat; /* TCP_NOTSENT_LOWAT */ u32 pushed_seq; /* Last pushed seq, required to talk to windows */ u32 lost_out; /* Lost packets */ u32 sacked_out; /* SACK'd packets */ struct hrtimer pacing_timer; struct hrtimer compressed_ack_timer; /* from STCP, retrans queue hinting */ struct sk_buff* lost_skb_hint; struct sk_buff *retransmit_skb_hint; /* OOO segments go in this rbtree. Socket lock must be held. */ struct rb_root out_of_order_queue; struct sk_buff *ooo_last_skb; /* cache rb_last(out_of_order_queue) */ /* SACKs data, these 2 need to be together (see tcp_options_write) */ struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */ struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/ struct tcp_sack_block recv_sack_cache[4]; struct sk_buff *highest_sack; /* skb just after the highest * skb with SACKed bit set * (validity guaranteed only if * sacked_out > 0) */ int lost_cnt_hint; u32 prior_ssthresh; /* ssthresh saved at recovery start */ u32 high_seq; /* snd_nxt at onset of congestion */ u32 retrans_stamp; /* Timestamp of the last retransmit, * also used in SYN-SENT to remember stamp of * the first SYN. 
*/ u32 undo_marker; /* snd_una upon a new recovery episode. */ int undo_retrans; /* number of undoable retransmissions. */ u64 bytes_retrans; /* RFC4898 tcpEStatsPerfOctetsRetrans * Total data bytes retransmitted */ u32 total_retrans; /* Total retransmits for entire connection */ u32 urg_seq; /* Seq of received urgent pointer */ unsigned int keepalive_time; /* time before keep alive takes place */ unsigned int keepalive_intvl; /* time interval between keep alive probes */ int linger2; /* Sock_ops bpf program related variables */ #ifdef CONFIG_BPF u8 bpf_sock_ops_cb_flags; /* Control calling BPF programs * values defined in uapi/linux/tcp.h */ #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) (TP->bpf_sock_ops_cb_flags & ARG) #else #define BPF_SOCK_OPS_TEST_FLAG(TP, ARG) 0 #endif u16 timeout_rehash; /* Timeout-triggered rehash attempts */ u32 rcv_ooopack; /* Received out-of-order packets, for tcpinfo */ /* Receiver side RTT estimation */ u32 rcv_rtt_last_tsecr; struct { u32 rtt_us; u32 seq; u64 time; } rcv_rtt_est; /* Receiver queue space */ struct { u32 space; u32 seq; u64 time; } rcvq_space; /* TCP-specific MTU probe information. */ struct { u32 probe_seq_start; u32 probe_seq_end; } mtu_probe; u32 mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG * while socket was owned by user. */ #if IS_ENABLED(CONFIG_MPTCP) bool is_mptcp; #endif #if IS_ENABLED(CONFIG_SMC) bool syn_smc; /* SYN includes SMC */ #endif #ifdef CONFIG_TCP_MD5SIG /* TCP AF-Specific parts; only used by MD5 Signature support so far */ const struct tcp_sock_af_ops *af_specific; /* TCP MD5 Signature Option information */ struct tcp_md5sig_info __rcu *md5sig_info; #endif /* TCP fastopen related information */ struct tcp_fastopen_request *fastopen_req; /* fastopen_rsk points to request_sock that resulted in this big * socket. Used to retransmit SYNACKs etc. 
*/ struct request_sock __rcu *fastopen_rsk; struct saved_syn *saved_syn; }; enum tsq_enum { TSQ_THROTTLED, TSQ_QUEUED, TCP_TSQ_DEFERRED, /* tcp_tasklet_func() found socket was owned */ TCP_WRITE_TIMER_DEFERRED, /* tcp_write_timer() found socket was owned */ TCP_DELACK_TIMER_DEFERRED, /* tcp_delack_timer() found socket was owned */ TCP_MTU_REDUCED_DEFERRED, /* tcp_v{4|6}_err() could not call * tcp_v{4|6}_mtu_reduced() */ }; enum tsq_flags { TSQF_THROTTLED = (1UL << TSQ_THROTTLED), TSQF_QUEUED = (1UL << TSQ_QUEUED), TCPF_TSQ_DEFERRED = (1UL << TCP_TSQ_DEFERRED), TCPF_WRITE_TIMER_DEFERRED = (1UL << TCP_WRITE_TIMER_DEFERRED), TCPF_DELACK_TIMER_DEFERRED = (1UL << TCP_DELACK_TIMER_DEFERRED), TCPF_MTU_REDUCED_DEFERRED = (1UL << TCP_MTU_REDUCED_DEFERRED), }; static inline struct tcp_sock *tcp_sk(const struct sock *sk) { return (struct tcp_sock *)sk; } struct tcp_timewait_sock { struct inet_timewait_sock tw_sk; #define tw_rcv_nxt tw_sk.__tw_common.skc_tw_rcv_nxt #define tw_snd_nxt tw_sk.__tw_common.skc_tw_snd_nxt u32 tw_rcv_wnd; u32 tw_ts_offset; u32 tw_ts_recent; /* The time we sent the last out-of-window ACK: */ u32 tw_last_oow_ack_time; int tw_ts_recent_stamp; u32 tw_tx_delay; #ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *tw_md5_key; #endif }; static inline struct tcp_timewait_sock *tcp_twsk(const struct sock *sk) { return (struct tcp_timewait_sock *)sk; } static inline bool tcp_passive_fastopen(const struct sock *sk) { return sk->sk_state == TCP_SYN_RECV && rcu_access_pointer(tcp_sk(sk)->fastopen_rsk) != NULL; } static inline void fastopen_queue_tune(struct sock *sk, int backlog) { struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); queue->fastopenq.max_qlen = min_t(unsigned int, backlog, somaxconn); } static inline void tcp_move_syn(struct tcp_sock *tp, struct request_sock *req) { tp->saved_syn = req->saved_syn; req->saved_syn = NULL; } static inline void tcp_saved_syn_free(struct tcp_sock *tp) { kfree(tp->saved_syn); tp->saved_syn = NULL; } static inline u32 tcp_saved_syn_len(const struct saved_syn *saved_syn) { return saved_syn->mac_hdrlen + saved_syn->network_hdrlen + saved_syn->tcp_hdrlen; } struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, const struct sk_buff *orig_skb); static inline u16 tcp_mss_clamp(const struct tcp_sock *tp, u16 mss) { /* We use READ_ONCE() here because socket might not be locked. * This happens for listeners. */ u16 user_mss = READ_ONCE(tp->rx_opt.user_mss); return (user_mss && user_mss < mss) ? user_mss : mss; } int tcp_skb_shift(struct sk_buff *to, struct sk_buff *from, int pcount, int shiftlen); void tcp_sock_set_cork(struct sock *sk, bool on); int tcp_sock_set_keepcnt(struct sock *sk, int val); int tcp_sock_set_keepidle_locked(struct sock *sk, int val); int tcp_sock_set_keepidle(struct sock *sk, int val); int tcp_sock_set_keepintvl(struct sock *sk, int val); void tcp_sock_set_nodelay(struct sock *sk); void tcp_sock_set_quickack(struct sock *sk, int val); int tcp_sock_set_syncnt(struct sock *sk, int val); void tcp_sock_set_user_timeout(struct sock *sk, u32 val); #endif /* _LINUX_TCP_H */
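/*
 * Illustrative sketch (not part of the header above): the header-length
 * helpers in tcp.h simply scale the 4-bit "data offset" field by 4. For a
 * TCP header with doff == 8 the header occupies 32 bytes, of which 20 are
 * the fixed header and 12 are options. The function name demo_tcp_lengths()
 * is an assumption chosen for illustration only.
 */
static void demo_tcp_lengths(const struct sk_buff *skb)
{
	const struct tcphdr *th = tcp_hdr(skb);
	unsigned int hdrlen = __tcp_hdrlen(th);	/* th->doff * 4       */
	unsigned int optlen = tcp_optlen(skb);	/* (th->doff - 5) * 4 */

	/* The two are always related by the fixed 20-byte header size. */
	WARN_ON_ONCE(hdrlen != optlen + sizeof(struct tcphdr));
}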
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* delayacct.h - per-task delay accounting * * Copyright (C) Shailabh Nagar, IBM Corp. 2006 */ #ifndef _LINUX_DELAYACCT_H #define _LINUX_DELAYACCT_H #include <uapi/linux/taskstats.h> /* * Per-task flags relevant to delay accounting * maintained privately to avoid exhausting similar flags in sched.h:PF_* * Used to set current->delays->flags */ #define DELAYACCT_PF_SWAPIN 0x00000001 /* I am doing a swapin */ #define DELAYACCT_PF_BLKIO 0x00000002 /* I am waiting on IO */ #ifdef CONFIG_TASK_DELAY_ACCT struct task_delay_info { raw_spinlock_t lock; unsigned int flags; /* Private per-task flags */ /* For each stat XXX, add following, aligned appropriately * * struct timespec XXX_start, XXX_end; * u64 XXX_delay; * u32 XXX_count; * * Atomicity of updates to XXX_delay, XXX_count protected by * single lock above (split into XXX_lock if contention is an issue). */ /* * XXX_count is incremented on every XXX operation, the delay * associated with the operation is added to XXX_delay. * XXX_delay contains the accumulated delay time in nanoseconds.
*/ u64 blkio_start; /* Shared by blkio, swapin */ u64 blkio_delay; /* wait for sync block io completion */ u64 swapin_delay; /* wait for swapin block io completion */ u32 blkio_count; /* total count of the number of sync block */ /* io operations performed */ u32 swapin_count; /* total count of the number of swapin block */ /* io operations performed */ u64 freepages_start; u64 freepages_delay; /* wait for memory reclaim */ u64 thrashing_start; u64 thrashing_delay; /* wait for thrashing page */ u32 freepages_count; /* total count of memory reclaim */ u32 thrashing_count; /* total count of thrash waits */ }; #endif #include <linux/sched.h> #include <linux/slab.h> #ifdef CONFIG_TASK_DELAY_ACCT extern int delayacct_on; /* Delay accounting turned on/off */ extern struct kmem_cache *delayacct_cache; extern void delayacct_init(void); extern void __delayacct_tsk_init(struct task_struct *); extern void __delayacct_tsk_exit(struct task_struct *); extern void __delayacct_blkio_start(void); extern void __delayacct_blkio_end(struct task_struct *); extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *); extern __u64 __delayacct_blkio_ticks(struct task_struct *); extern void __delayacct_freepages_start(void); extern void __delayacct_freepages_end(void); extern void __delayacct_thrashing_start(void); extern void __delayacct_thrashing_end(void); static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { if (p->delays) return (p->delays->flags & DELAYACCT_PF_BLKIO); else return 0; } static inline void delayacct_set_flag(int flag) { if (current->delays) current->delays->flags |= flag; } static inline void delayacct_clear_flag(int flag) { if (current->delays) current->delays->flags &= ~flag; } static inline void delayacct_tsk_init(struct task_struct *tsk) { /* reinitialize in case parent's non-null pointer was dup'ed*/ tsk->delays = NULL; if (delayacct_on) __delayacct_tsk_init(tsk); } /* Free tsk->delays. 
Called from bad fork and __put_task_struct * where there's no risk of tsk->delays being accessed elsewhere */ static inline void delayacct_tsk_free(struct task_struct *tsk) { if (tsk->delays) kmem_cache_free(delayacct_cache, tsk->delays); tsk->delays = NULL; } static inline void delayacct_blkio_start(void) { delayacct_set_flag(DELAYACCT_PF_BLKIO); if (current->delays) __delayacct_blkio_start(); } static inline void delayacct_blkio_end(struct task_struct *p) { if (p->delays) __delayacct_blkio_end(p); delayacct_clear_flag(DELAYACCT_PF_BLKIO); } static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { if (!delayacct_on || !tsk->delays) return 0; return __delayacct_add_tsk(d, tsk); } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { if (tsk->delays) return __delayacct_blkio_ticks(tsk); return 0; } static inline void delayacct_freepages_start(void) { if (current->delays) __delayacct_freepages_start(); } static inline void delayacct_freepages_end(void) { if (current->delays) __delayacct_freepages_end(); } static inline void delayacct_thrashing_start(void) { if (current->delays) __delayacct_thrashing_start(); } static inline void delayacct_thrashing_end(void) { if (current->delays) __delayacct_thrashing_end(); } #else static inline void delayacct_set_flag(int flag) {} static inline void delayacct_clear_flag(int flag) {} static inline void delayacct_init(void) {} static inline void delayacct_tsk_init(struct task_struct *tsk) {} static inline void delayacct_tsk_free(struct task_struct *tsk) {} static inline void delayacct_blkio_start(void) {} static inline void delayacct_blkio_end(struct task_struct *p) {} static inline int delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) { return 0; } static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk) { return 0; } static inline int delayacct_is_task_waiting_on_io(struct task_struct *p) { return 0; } static inline void delayacct_freepages_start(void) {} static inline void delayacct_freepages_end(void) {} static inline void delayacct_thrashing_start(void) {} static inline void delayacct_thrashing_end(void) {} #endif /* CONFIG_TASK_DELAY_ACCT */ #endif
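/*
 * Illustrative sketch (not part of the header above): delayacct_blkio_start()
 * and delayacct_blkio_end() are designed to bracket the blocking region being
 * accounted, and compile to no-ops when CONFIG_TASK_DELAY_ACCT is disabled.
 * The function my_wait_for_io() and the use of a struct completion (from
 * <linux/completion.h>) are assumptions for illustration only; they are not
 * how the kernel's real call sites are arranged.
 */
static void my_wait_for_io(struct completion *done)
{
	delayacct_blkio_start();	/* sets DELAYACCT_PF_BLKIO, records the start */
	wait_for_completion(done);	/* the delay actually being measured */
	delayacct_blkio_end(current);	/* accumulates into blkio_delay / blkio_count */
}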
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H #include <linux/sched/coredump.h> #include <linux/mm_types.h> #include <linux/fs.h> /* only for vma_is_dax() */ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf); int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma); void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm, pud_t *dst_pud, pud_t *src_pud, unsigned long addr, struct vm_area_struct *vma); #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud); #else static inline void huge_pud_set_accessed(struct vm_fault *vmf, pud_t orig_pud) { } #endif vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags); bool madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long next); int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr); int zap_huge_pud(struct mmu_gather *tlb, struct vm_area_struct *vma, pud_t *pud, unsigned long addr); bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd); int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, unsigned long cp_flags);
vm_fault_t vmf_insert_pfn_pmd_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pmd - insert a pmd size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pmd size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pmd(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pmd_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } vm_fault_t vmf_insert_pfn_pud_prot(struct vm_fault *vmf, pfn_t pfn, pgprot_t pgprot, bool write); /** * vmf_insert_pfn_pud - insert a pud size pfn * @vmf: Structure describing the fault * @pfn: pfn to insert * @pgprot: page protection to use * @write: whether it's a write fault * * Insert a pud size pfn. See vmf_insert_pfn() for additional info. * * Return: vm_fault_t value. */ static inline vm_fault_t vmf_insert_pfn_pud(struct vm_fault *vmf, pfn_t pfn, bool write) { return vmf_insert_pfn_pud_prot(vmf, pfn, vmf->vma->vm_page_prot, write); } enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_NEVER_DAX, TRANSPARENT_HUGEPAGE_FLAG, TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif }; struct kobject; struct kobj_attribute; ssize_t single_hugepage_flag_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count, enum transparent_hugepage_flag flag); ssize_t single_hugepage_flag_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, enum transparent_hugepage_flag flag); extern struct kobj_attribute shmem_enabled_attr; #define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) #define HPAGE_PMD_NR (1<<HPAGE_PMD_ORDER) #ifdef CONFIG_TRANSPARENT_HUGEPAGE #define HPAGE_PMD_SHIFT PMD_SHIFT #define HPAGE_PMD_SIZE ((1UL) << HPAGE_PMD_SHIFT) #define HPAGE_PMD_MASK (~(HPAGE_PMD_SIZE - 1)) #define HPAGE_PUD_SHIFT PUD_SHIFT #define HPAGE_PUD_SIZE ((1UL) << HPAGE_PUD_SHIFT) #define HPAGE_PUD_MASK (~(HPAGE_PUD_SIZE - 1)) extern unsigned long transparent_hugepage_flags; static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { /* Don't have to check pgoff for anonymous vma */ if (!vma_is_anonymous(vma)) { if (!IS_ALIGNED((vma->vm_start >> PAGE_SHIFT) - vma->vm_pgoff, HPAGE_PMD_NR)) return false; } if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return false; return true; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { /* Explicitly disabled through madvise. */ if ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags)) return false; return true; } /* * to be used on vmas which are known to support THP. * Use transparent_hugepage_active otherwise */ static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { /* * If the hardware/firmware marked hugepage support disabled. 
*/ if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_NEVER_DAX)) return false; if (!transhuge_vma_enabled(vma, vma->vm_flags)) return false; if (vma_is_temporary_stack(vma)) return false; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_FLAG)) return true; if (vma_is_dax(vma)) return true; if (transparent_hugepage_flags & (1 << TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) return !!(vma->vm_flags & VM_HUGEPAGE); return false; } bool transparent_hugepage_active(struct vm_area_struct *vma); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ (1<<TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG)) unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); void prep_transhuge_page(struct page *page); void free_transhuge_page(struct page *page); bool is_transparent_hugepage(struct page *page); bool can_split_huge_page(struct page *page, int *pextra_pins); int split_huge_page_to_list(struct page *page, struct list_head *list); static inline int split_huge_page(struct page *page) { return split_huge_page_to_list(page, NULL); } void deferred_split_huge_page(struct page *page); void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page); #define split_huge_pmd(__vma, __pmd, __address) \ do { \ pmd_t *____pmd = (__pmd); \ if (is_swap_pmd(*____pmd) || pmd_trans_huge(*____pmd) \ || pmd_devmap(*____pmd)) \ __split_huge_pmd(__vma, __pmd, __address, \ false, NULL); \ } while (0) void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page); void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, unsigned long address); #define split_huge_pud(__vma, __pud, __address) \ do { \ pud_t *____pud = (__pud); \ if (pud_trans_huge(*____pud) \ || pud_devmap(*____pud)) \ __split_huge_pud(__vma, __pud, __address); \ } while (0) int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); spinlock_t *__pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma); static inline int is_swap_pmd(pmd_t pmd) { return !pmd_none(pmd) && !pmd_present(pmd); } /* mmap_lock must be held on entry */ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { if (is_swap_pmd(*pmd) || pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) return __pmd_trans_huge_lock(pmd, vma); else return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { if (pud_trans_huge(*pud) || pud_devmap(*pud)) return __pud_trans_huge_lock(pud, vma); else return NULL; } /** * thp_head - Head page of a transparent huge page. * @page: Any page (tail, head or regular) found in the page cache. */ static inline struct page *thp_head(struct page *page) { return compound_head(page); } /** * thp_order - Order of a transparent huge page. * @page: Head page of a transparent huge page. */ static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_ORDER; return 0; } /** * thp_nr_pages - The number of regular pages in this huge page. * @page: The head page of a huge page. 
*/ static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); if (PageHead(page)) return HPAGE_PMD_NR; return 1; } struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap); struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap); vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; extern unsigned long huge_zero_pfn; static inline bool is_huge_zero_page(struct page *page) { return READ_ONCE(huge_zero_page) == page; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return READ_ONCE(huge_zero_pfn) == pmd_pfn(pmd) && pmd_present(pmd); } static inline bool is_huge_zero_pud(pud_t pud) { return false; } struct page *mm_get_huge_zero_page(struct mm_struct *mm); void mm_put_huge_zero_page(struct mm_struct *mm); #define mk_huge_pmd(page, prot) pmd_mkhuge(mk_pmd(page, prot)) static inline bool thp_migration_supported(void) { return IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION); } static inline struct list_head *page_deferred_list(struct page *page) { /* * Global or memcg deferred list in the second tail pages is * occupied by compound_head. */ return &page[2].deferred_list; } #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SHIFT ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_MASK ({ BUILD_BUG(); 0; }) #define HPAGE_PUD_SIZE ({ BUILD_BUG(); 0; }) static inline struct page *thp_head(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return page; } static inline unsigned int thp_order(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 0; } static inline int thp_nr_pages(struct page *page) { VM_BUG_ON_PGFLAGS(PageTail(page), page); return 1; } static inline bool __transparent_hugepage_enabled(struct vm_area_struct *vma) { return false; } static inline bool transparent_hugepage_active(struct vm_area_struct *vma) { return false; } static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, unsigned long haddr) { return false; } static inline bool transhuge_vma_enabled(struct vm_area_struct *vma, unsigned long vm_flags) { return false; } static inline void prep_transhuge_page(struct page *page) {} static inline bool is_transparent_hugepage(struct page *page) { return false; } #define transparent_hugepage_flags 0UL #define thp_get_unmapped_area NULL static inline bool can_split_huge_page(struct page *page, int *pextra_pins) { BUILD_BUG(); return false; } static inline int split_huge_page_to_list(struct page *page, struct list_head *list) { return 0; } static inline int split_huge_page(struct page *page) { return 0; } static inline void deferred_split_huge_page(struct page *page) {} #define split_huge_pmd(__vma, __pmd, __address) \ do { } while (0) static inline void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long address, bool freeze, struct page *page) {} static inline void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct page *page) {} #define split_huge_pud(__vma, __pmd, __address) \ do { } while (0) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { BUG(); return 0; } static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long 
adjust_next) { } static inline int is_swap_pmd(pmd_t pmd) { return 0; } static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { return NULL; } static inline spinlock_t *pud_trans_huge_lock(pud_t *pud, struct vm_area_struct *vma) { return NULL; } static inline vm_fault_t do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { return 0; } static inline bool is_huge_zero_page(struct page *page) { return false; } static inline bool is_huge_zero_pmd(pmd_t pmd) { return false; } static inline bool is_huge_zero_pud(pud_t pud) { return false; } static inline void mm_put_huge_zero_page(struct mm_struct *mm) { return; } static inline struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr, pud_t *pud, int flags, struct dev_pagemap **pgmap) { return NULL; } static inline bool thp_migration_supported(void) { return false; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /** * thp_size - Size of a transparent huge page. * @page: Head page of a transparent huge page. * * Return: Number of bytes in this page. */ static inline unsigned long thp_size(struct page *page) { return PAGE_SIZE << thp_order(page); } #endif /* _LINUX_HUGE_MM_H */
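/*
 * Illustrative sketch (not part of the header above): the thp_order(),
 * thp_nr_pages() and thp_size() helpers are mutually consistent. On a
 * configuration with 4 KiB base pages and 2 MiB PMD huge pages, a THP head
 * page has order 9, 512 constituent pages and a size of 2 MiB, while a
 * regular page reports order 0, 1 page and PAGE_SIZE. The function name
 * demo_thp_geometry() is an assumption for illustration only.
 */
static void demo_thp_geometry(struct page *head)
{
	unsigned int order = thp_order(head);	/* HPAGE_PMD_ORDER for a THP head, else 0 */
	int nr = thp_nr_pages(head);		/* 1 << order constituent pages */
	unsigned long bytes = thp_size(head);	/* PAGE_SIZE << order */

	VM_BUG_ON_PAGE(bytes != (unsigned long)nr * PAGE_SIZE, head);
}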
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Copyright 2003-2005 Red Hat, Inc. All rights reserved. * Copyright 2003-2005 Jeff Garzik * * libata documentation is available via 'make {ps|pdf}docs', * as Documentation/driver-api/libata.rst */ #ifndef __LINUX_LIBATA_H__ #define __LINUX_LIBATA_H__ #include <linux/delay.h> #include <linux/jiffies.h> #include <linux/interrupt.h> #include <linux/dma-mapping.h> #include <linux/scatterlist.h> #include <linux/io.h> #include <linux/ata.h> #include <linux/workqueue.h> #include <scsi/scsi_host.h> #include <linux/acpi.h> #include <linux/cdrom.h> #include <linux/sched.h> #include <linux/async.h> /* * Define if arch has non-standard setup. This is a _PCI_ standard * not a legacy or ISA standard. */ #ifdef CONFIG_ATA_NONSTANDARD #include <asm/libata-portmap.h> #else #define ATA_PRIMARY_IRQ(dev) 14 #define ATA_SECONDARY_IRQ(dev) 15 #endif /* * compile-time options: to be removed as soon as all the drivers are * converted to the new debugging mechanism */ #undef ATA_DEBUG /* debugging output */ #undef ATA_VERBOSE_DEBUG /* yet more debugging output */ #undef ATA_IRQ_TRAP /* define to ack screaming irqs */ #undef ATA_NDEBUG /* define to disable quick runtime checks */ /* note: prints function name for you */ #ifdef ATA_DEBUG #define DPRINTK(fmt, args...) printk(KERN_ERR "%s: " fmt, __func__, ## args) #ifdef ATA_VERBOSE_DEBUG #define VPRINTK(fmt, args...)
printk(KERN_ERR "%s: " fmt, __func__, ## args) #else #define VPRINTK(fmt, args...) #endif /* ATA_VERBOSE_DEBUG */ #else #define DPRINTK(fmt, args...) #define VPRINTK(fmt, args...) #endif /* ATA_DEBUG */ #define ata_print_version_once(dev, version) \ ({ \ static bool __print_once; \ \ if (!__print_once) { \ __print_once = true; \ ata_print_version(dev, version); \ } \ }) /* NEW: debug levels */ #define HAVE_LIBATA_MSG 1 enum { ATA_MSG_DRV = 0x0001, ATA_MSG_INFO = 0x0002, ATA_MSG_PROBE = 0x0004, ATA_MSG_WARN = 0x0008, ATA_MSG_MALLOC = 0x0010, ATA_MSG_CTL = 0x0020, ATA_MSG_INTR = 0x0040, ATA_MSG_ERR = 0x0080, }; #define ata_msg_drv(p) ((p)->msg_enable & ATA_MSG_DRV) #define ata_msg_info(p) ((p)->msg_enable & ATA_MSG_INFO) #define ata_msg_probe(p) ((p)->msg_enable & ATA_MSG_PROBE) #define ata_msg_warn(p) ((p)->msg_enable & ATA_MSG_WARN) #define ata_msg_malloc(p) ((p)->msg_enable & ATA_MSG_MALLOC) #define ata_msg_ctl(p) ((p)->msg_enable & ATA_MSG_CTL) #define ata_msg_intr(p) ((p)->msg_enable & ATA_MSG_INTR) #define ata_msg_err(p) ((p)->msg_enable & ATA_MSG_ERR) static inline u32 ata_msg_init(int dval, int default_msg_enable_bits) { if (dval < 0 || dval >= (sizeof(u32) * 8)) return default_msg_enable_bits; /* should be 0x1 - only driver info msgs */ if (!dval) return 0; return (1 << dval) - 1; } /* defines only for the constants which don't work well as enums */ #define ATA_TAG_POISON 0xfafbfcfdU enum { /* various global constants */ LIBATA_MAX_PRD = ATA_MAX_PRD / 2, LIBATA_DUMB_MAX_PRD = ATA_MAX_PRD / 4, /* Worst case */ ATA_DEF_QUEUE = 1, ATA_MAX_QUEUE = 32, ATA_TAG_INTERNAL = ATA_MAX_QUEUE, ATA_SHORT_PAUSE = 16, ATAPI_MAX_DRAIN = 16 << 10, ATA_ALL_DEVICES = (1 << ATA_MAX_DEVICES) - 1, ATA_SHT_EMULATED = 1, ATA_SHT_THIS_ID = -1, /* struct ata_taskfile flags */ ATA_TFLAG_LBA48 = (1 << 0), /* enable 48-bit LBA and "HOB" */ ATA_TFLAG_ISADDR = (1 << 1), /* enable r/w to nsect/lba regs */ ATA_TFLAG_DEVICE = (1 << 2), /* enable r/w to device reg */ ATA_TFLAG_WRITE = (1 << 3), /* data dir: host->dev==1 (write) */ ATA_TFLAG_LBA = (1 << 4), /* enable LBA */ ATA_TFLAG_FUA = (1 << 5), /* enable FUA */ ATA_TFLAG_POLLING = (1 << 6), /* set nIEN to 1 and use polling */ /* struct ata_device stuff */ ATA_DFLAG_LBA = (1 << 0), /* device supports LBA */ ATA_DFLAG_LBA48 = (1 << 1), /* device supports LBA48 */ ATA_DFLAG_CDB_INTR = (1 << 2), /* device asserts INTRQ when ready for CDB */ ATA_DFLAG_NCQ = (1 << 3), /* device supports NCQ */ ATA_DFLAG_FLUSH_EXT = (1 << 4), /* do FLUSH_EXT instead of FLUSH */ ATA_DFLAG_ACPI_PENDING = (1 << 5), /* ACPI resume action pending */ ATA_DFLAG_ACPI_FAILED = (1 << 6), /* ACPI on devcfg has failed */ ATA_DFLAG_AN = (1 << 7), /* AN configured */ ATA_DFLAG_TRUSTED = (1 << 8), /* device supports trusted send/recv */ ATA_DFLAG_DMADIR = (1 << 10), /* device requires DMADIR */ ATA_DFLAG_CFG_MASK = (1 << 12) - 1, ATA_DFLAG_PIO = (1 << 12), /* device limited to PIO mode */ ATA_DFLAG_NCQ_OFF = (1 << 13), /* device limited to non-NCQ mode */ ATA_DFLAG_SLEEPING = (1 << 15), /* device is sleeping */ ATA_DFLAG_DUBIOUS_XFER = (1 << 16), /* data transfer not verified */ ATA_DFLAG_NO_UNLOAD = (1 << 17), /* device doesn't support unload */ ATA_DFLAG_UNLOCK_HPA = (1 << 18), /* unlock HPA */ ATA_DFLAG_NCQ_SEND_RECV = (1 << 19), /* device supports NCQ SEND and RECV */ ATA_DFLAG_NCQ_PRIO = (1 << 20), /* device supports NCQ priority */ ATA_DFLAG_NCQ_PRIO_ENABLE = (1 << 21), /* Priority cmds sent to dev */ ATA_DFLAG_INIT_MASK = (1 << 24) - 1, ATA_DFLAG_DETACH = (1 << 24), ATA_DFLAG_DETACHED = (1 << 
25), ATA_DFLAG_DA = (1 << 26), /* device supports Device Attention */ ATA_DFLAG_DEVSLP = (1 << 27), /* device supports Device Sleep */ ATA_DFLAG_ACPI_DISABLED = (1 << 28), /* ACPI for the device is disabled */ ATA_DFLAG_D_SENSE = (1 << 29), /* Descriptor sense requested */ ATA_DFLAG_ZAC = (1 << 30), /* ZAC device */ ATA_DEV_UNKNOWN = 0, /* unknown device */ ATA_DEV_ATA = 1, /* ATA device */ ATA_DEV_ATA_UNSUP = 2, /* ATA device (unsupported) */ ATA_DEV_ATAPI = 3, /* ATAPI device */ ATA_DEV_ATAPI_UNSUP = 4, /* ATAPI device (unsupported) */ ATA_DEV_PMP = 5, /* SATA port multiplier */ ATA_DEV_PMP_UNSUP = 6, /* SATA port multiplier (unsupported) */ ATA_DEV_SEMB = 7, /* SEMB */ ATA_DEV_SEMB_UNSUP = 8, /* SEMB (unsupported) */ ATA_DEV_ZAC = 9, /* ZAC device */ ATA_DEV_ZAC_UNSUP = 10, /* ZAC device (unsupported) */ ATA_DEV_NONE = 11, /* no device */ /* struct ata_link flags */ /* NOTE: struct ata_force_param currently stores lflags in u16 */ ATA_LFLAG_NO_HRST = (1 << 1), /* avoid hardreset */ ATA_LFLAG_NO_SRST = (1 << 2), /* avoid softreset */ ATA_LFLAG_ASSUME_ATA = (1 << 3), /* assume ATA class */ ATA_LFLAG_ASSUME_SEMB = (1 << 4), /* assume SEMB class */ ATA_LFLAG_ASSUME_CLASS = ATA_LFLAG_ASSUME_ATA | ATA_LFLAG_ASSUME_SEMB, ATA_LFLAG_NO_RETRY = (1 << 5), /* don't retry this link */ ATA_LFLAG_DISABLED = (1 << 6), /* link is disabled */ ATA_LFLAG_SW_ACTIVITY = (1 << 7), /* keep activity stats */ ATA_LFLAG_NO_LPM = (1 << 8), /* disable LPM on this link */ ATA_LFLAG_RST_ONCE = (1 << 9), /* limit recovery to one reset */ ATA_LFLAG_CHANGED = (1 << 10), /* LPM state changed on this link */ ATA_LFLAG_NO_DB_DELAY = (1 << 11), /* no debounce delay on link resume */ /* struct ata_port flags */ ATA_FLAG_SLAVE_POSS = (1 << 0), /* host supports slave dev */ /* (doesn't imply presence) */ ATA_FLAG_SATA = (1 << 1), ATA_FLAG_NO_LPM = (1 << 2), /* host not happy with LPM */ ATA_FLAG_NO_LOG_PAGE = (1 << 5), /* do not issue log page read */ ATA_FLAG_NO_ATAPI = (1 << 6), /* No ATAPI support */ ATA_FLAG_PIO_DMA = (1 << 7), /* PIO cmds via DMA */ ATA_FLAG_PIO_LBA48 = (1 << 8), /* Host DMA engine is LBA28 only */ ATA_FLAG_PIO_POLLING = (1 << 9), /* use polling PIO if LLD * doesn't handle PIO interrupts */ ATA_FLAG_NCQ = (1 << 10), /* host supports NCQ */ ATA_FLAG_NO_POWEROFF_SPINDOWN = (1 << 11), /* don't spindown before poweroff */ ATA_FLAG_NO_HIBERNATE_SPINDOWN = (1 << 12), /* don't spindown before hibernation */ ATA_FLAG_DEBUGMSG = (1 << 13), ATA_FLAG_FPDMA_AA = (1 << 14), /* driver supports Auto-Activate */ ATA_FLAG_IGN_SIMPLEX = (1 << 15), /* ignore SIMPLEX */ ATA_FLAG_NO_IORDY = (1 << 16), /* controller lacks iordy */ ATA_FLAG_ACPI_SATA = (1 << 17), /* need native SATA ACPI layout */ ATA_FLAG_AN = (1 << 18), /* controller supports AN */ ATA_FLAG_PMP = (1 << 19), /* controller supports PMP */ ATA_FLAG_FPDMA_AUX = (1 << 20), /* controller supports H2DFIS aux field */ ATA_FLAG_EM = (1 << 21), /* driver supports enclosure * management */ ATA_FLAG_SW_ACTIVITY = (1 << 22), /* driver supports sw activity * led */ ATA_FLAG_NO_DIPM = (1 << 23), /* host not happy with DIPM */ ATA_FLAG_SAS_HOST = (1 << 24), /* SAS host */ /* bits 24:31 of ap->flags are reserved for LLD specific flags */ /* struct ata_port pflags */ ATA_PFLAG_EH_PENDING = (1 << 0), /* EH pending */ ATA_PFLAG_EH_IN_PROGRESS = (1 << 1), /* EH in progress */ ATA_PFLAG_FROZEN = (1 << 2), /* port is frozen */ ATA_PFLAG_RECOVERED = (1 << 3), /* recovery action performed */ ATA_PFLAG_LOADING = (1 << 4), /* boot/loading probe */ ATA_PFLAG_SCSI_HOTPLUG = (1 << 6), /* 
SCSI hotplug scheduled */ ATA_PFLAG_INITIALIZING = (1 << 7), /* being initialized, don't touch */ ATA_PFLAG_RESETTING = (1 << 8), /* reset in progress */ ATA_PFLAG_UNLOADING = (1 << 9), /* driver is being unloaded */ ATA_PFLAG_UNLOADED = (1 << 10), /* driver is unloaded */ ATA_PFLAG_SUSPENDED = (1 << 17), /* port is suspended (power) */ ATA_PFLAG_PM_PENDING = (1 << 18), /* PM operation pending */ ATA_PFLAG_INIT_GTM_VALID = (1 << 19), /* initial gtm data valid */ ATA_PFLAG_PIO32 = (1 << 20), /* 32bit PIO */ ATA_PFLAG_PIO32CHANGE = (1 << 21), /* 32bit PIO can be turned on/off */ ATA_PFLAG_EXTERNAL = (1 << 22), /* eSATA/external port */ /* struct ata_queued_cmd flags */ ATA_QCFLAG_ACTIVE = (1 << 0), /* cmd not yet ack'd to scsi lyer */ ATA_QCFLAG_DMAMAP = (1 << 1), /* SG table is DMA mapped */ ATA_QCFLAG_IO = (1 << 3), /* standard IO command */ ATA_QCFLAG_RESULT_TF = (1 << 4), /* result TF requested */ ATA_QCFLAG_CLEAR_EXCL = (1 << 5), /* clear excl_link on completion */ ATA_QCFLAG_QUIET = (1 << 6), /* don't report device error */ ATA_QCFLAG_RETRY = (1 << 7), /* retry after failure */ ATA_QCFLAG_FAILED = (1 << 16), /* cmd failed and is owned by EH */ ATA_QCFLAG_SENSE_VALID = (1 << 17), /* sense data valid */ ATA_QCFLAG_EH_SCHEDULED = (1 << 18), /* EH scheduled (obsolete) */ /* host set flags */ ATA_HOST_SIMPLEX = (1 << 0), /* Host is simplex, one DMA channel per host only */ ATA_HOST_STARTED = (1 << 1), /* Host started */ ATA_HOST_PARALLEL_SCAN = (1 << 2), /* Ports on this host can be scanned in parallel */ ATA_HOST_IGNORE_ATA = (1 << 3), /* Ignore ATA devices on this host. */ /* bits 24:31 of host->flags are reserved for LLD specific flags */ /* various lengths of time */ ATA_TMOUT_BOOT = 30000, /* heuristic */ ATA_TMOUT_BOOT_QUICK = 7000, /* heuristic */ ATA_TMOUT_INTERNAL_QUICK = 5000, ATA_TMOUT_MAX_PARK = 30000, /* * GoVault needs 2s and iVDR disk HHD424020F7SV00 800ms. 2s * is too much without parallel probing. Use 2s if parallel * probing is available, 800ms otherwise. */ ATA_TMOUT_FF_WAIT_LONG = 2000, ATA_TMOUT_FF_WAIT = 800, /* Spec mandates to wait for ">= 2ms" before checking status * after reset. We wait 150ms, because that was the magic * delay used for ATAPI devices in Hale Landis's ATADRVR, for * the period of time between when the ATA command register is * written, and then status is checked. Because waiting for * "a while" before checking status is fine, post SRST, we * perform this magic delay here as well. * * Old drivers/ide uses the 2mS rule and then waits for ready. */ ATA_WAIT_AFTER_RESET = 150, /* If PMP is supported, we have to do follow-up SRST. As some * PMPs don't send D2H Reg FIS after hardreset, LLDs are * advised to wait only for the following duration before * doing SRST. */ ATA_TMOUT_PMP_SRST_WAIT = 5000, /* When the LPM policy is set to ATA_LPM_MAX_POWER, there might * be a spurious PHY event, so ignore the first PHY event that * occurs within 10s after the policy change. 
*/ ATA_TMOUT_SPURIOUS_PHY = 10000, /* ATA bus states */ BUS_UNKNOWN = 0, BUS_DMA = 1, BUS_IDLE = 2, BUS_NOINTR = 3, BUS_NODATA = 4, BUS_TIMER = 5, BUS_PIO = 6, BUS_EDD = 7, BUS_IDENTIFY = 8, BUS_PACKET = 9, /* SATA port states */ PORT_UNKNOWN = 0, PORT_ENABLED = 1, PORT_DISABLED = 2, /* encoding various smaller bitmaps into a single * unsigned long bitmap */ ATA_NR_PIO_MODES = 7, ATA_NR_MWDMA_MODES = 5, ATA_NR_UDMA_MODES = 8, ATA_SHIFT_PIO = 0, ATA_SHIFT_MWDMA = ATA_SHIFT_PIO + ATA_NR_PIO_MODES, ATA_SHIFT_UDMA = ATA_SHIFT_MWDMA + ATA_NR_MWDMA_MODES, ATA_SHIFT_PRIO = 6, ATA_PRIO_HIGH = 2, /* size of buffer to pad xfers ending on unaligned boundaries */ ATA_DMA_PAD_SZ = 4, /* ering size */ ATA_ERING_SIZE = 32, /* return values for ->qc_defer */ ATA_DEFER_LINK = 1, ATA_DEFER_PORT = 2, /* desc_len for ata_eh_info and context */ ATA_EH_DESC_LEN = 80, /* reset / recovery action types */ ATA_EH_REVALIDATE = (1 << 0), ATA_EH_SOFTRESET = (1 << 1), /* meaningful only in ->prereset */ ATA_EH_HARDRESET = (1 << 2), /* meaningful only in ->prereset */ ATA_EH_RESET = ATA_EH_SOFTRESET | ATA_EH_HARDRESET, ATA_EH_ENABLE_LINK = (1 << 3), ATA_EH_PARK = (1 << 5), /* unload heads and stop I/O */ ATA_EH_PERDEV_MASK = ATA_EH_REVALIDATE | ATA_EH_PARK, ATA_EH_ALL_ACTIONS = ATA_EH_REVALIDATE | ATA_EH_RESET | ATA_EH_ENABLE_LINK, /* ata_eh_info->flags */ ATA_EHI_HOTPLUGGED = (1 << 0), /* could have been hotplugged */ ATA_EHI_NO_AUTOPSY = (1 << 2), /* no autopsy */ ATA_EHI_QUIET = (1 << 3), /* be quiet */ ATA_EHI_NO_RECOVERY = (1 << 4), /* no recovery */ ATA_EHI_DID_SOFTRESET = (1 << 16), /* already soft-reset this port */ ATA_EHI_DID_HARDRESET = (1 << 17), /* already soft-reset this port */ ATA_EHI_PRINTINFO = (1 << 18), /* print configuration info */ ATA_EHI_SETMODE = (1 << 19), /* configure transfer mode */ ATA_EHI_POST_SETMODE = (1 << 20), /* revalidating after setmode */ ATA_EHI_DID_RESET = ATA_EHI_DID_SOFTRESET | ATA_EHI_DID_HARDRESET, /* mask of flags to transfer *to* the slave link */ ATA_EHI_TO_SLAVE_MASK = ATA_EHI_NO_AUTOPSY | ATA_EHI_QUIET, /* max tries if error condition is still set after ->error_handler */ ATA_EH_MAX_TRIES = 5, /* sometimes resuming a link requires several retries */ ATA_LINK_RESUME_TRIES = 5, /* how hard are we gonna try to probe/recover devices */ ATA_PROBE_MAX_TRIES = 3, ATA_EH_DEV_TRIES = 3, ATA_EH_PMP_TRIES = 5, ATA_EH_PMP_LINK_TRIES = 3, SATA_PMP_RW_TIMEOUT = 3000, /* PMP read/write timeout */ /* This should match the actual table size of * ata_eh_cmd_timeout_table in libata-eh.c. */ ATA_EH_CMD_TIMEOUT_TABLE_SIZE = 7, /* Horkage types. 
May be set by libata or controller on drives (some horkage may be drive/controller pair dependent */ ATA_HORKAGE_DIAGNOSTIC = (1 << 0), /* Failed boot diag */ ATA_HORKAGE_NODMA = (1 << 1), /* DMA problems */ ATA_HORKAGE_NONCQ = (1 << 2), /* Don't use NCQ */ ATA_HORKAGE_MAX_SEC_128 = (1 << 3), /* Limit max sects to 128 */ ATA_HORKAGE_BROKEN_HPA = (1 << 4), /* Broken HPA */ ATA_HORKAGE_DISABLE = (1 << 5), /* Disable it */ ATA_HORKAGE_HPA_SIZE = (1 << 6), /* native size off by one */ ATA_HORKAGE_IVB = (1 << 8), /* cbl det validity bit bugs */ ATA_HORKAGE_STUCK_ERR = (1 << 9), /* stuck ERR on next PACKET */ ATA_HORKAGE_BRIDGE_OK = (1 << 10), /* no bridge limits */ ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands not multiple of 16 bytes */ ATA_HORKAGE_FIRMWARE_WARN = (1 << 12), /* firmware update warning */ ATA_HORKAGE_1_5_GBPS = (1 << 13), /* force 1.5 Gbps */ ATA_HORKAGE_NOSETXFER = (1 << 14), /* skip SETXFER, SATA only */ ATA_HORKAGE_BROKEN_FPDMA_AA = (1 << 15), /* skip AA */ ATA_HORKAGE_DUMP_ID = (1 << 16), /* dump IDENTIFY data */ ATA_HORKAGE_MAX_SEC_LBA48 = (1 << 17), /* Set max sects to 65535 */ ATA_HORKAGE_ATAPI_DMADIR = (1 << 18), /* device requires dmadir */ ATA_HORKAGE_NO_NCQ_TRIM = (1 << 19), /* don't use queued TRIM */ ATA_HORKAGE_NOLPM = (1 << 20), /* don't use LPM */ ATA_HORKAGE_WD_BROKEN_LPM = (1 << 21), /* some WDs have broken LPM */ ATA_HORKAGE_ZERO_AFTER_TRIM = (1 << 22),/* guarantees zero after trim */ ATA_HORKAGE_NO_DMA_LOG = (1 << 23), /* don't use DMA for log read */ ATA_HORKAGE_NOTRIM = (1 << 24), /* don't use TRIM */ ATA_HORKAGE_MAX_SEC_1024 = (1 << 25), /* Limit max sects to 1024 */ ATA_HORKAGE_MAX_TRIM_128M = (1 << 26), /* Limit max trim size to 128M */ ATA_HORKAGE_NO_NCQ_ON_ATI = (1 << 27), /* Disable NCQ on ATI chipset */ /* DMA mask for user DMA control: User visible values; DO NOT renumber */ ATA_DMA_MASK_ATA = (1 << 0), /* DMA on ATA Disk */ ATA_DMA_MASK_ATAPI = (1 << 1), /* DMA on ATAPI */ ATA_DMA_MASK_CFA = (1 << 2), /* DMA on CF Card */ /* ATAPI command types */ ATAPI_READ = 0, /* READs */ ATAPI_WRITE = 1, /* WRITEs */ ATAPI_READ_CD = 2, /* READ CD [MSF] */ ATAPI_PASS_THRU = 3, /* SAT pass-thru */ ATAPI_MISC = 4, /* the rest */ /* Timing constants */ ATA_TIMING_SETUP = (1 << 0), ATA_TIMING_ACT8B = (1 << 1), ATA_TIMING_REC8B = (1 << 2), ATA_TIMING_CYC8B = (1 << 3), ATA_TIMING_8BIT = ATA_TIMING_ACT8B | ATA_TIMING_REC8B | ATA_TIMING_CYC8B, ATA_TIMING_ACTIVE = (1 << 4), ATA_TIMING_RECOVER = (1 << 5), ATA_TIMING_DMACK_HOLD = (1 << 6), ATA_TIMING_CYCLE = (1 << 7), ATA_TIMING_UDMA = (1 << 8), ATA_TIMING_ALL = ATA_TIMING_SETUP | ATA_TIMING_ACT8B | ATA_TIMING_REC8B | ATA_TIMING_CYC8B | ATA_TIMING_ACTIVE | ATA_TIMING_RECOVER | ATA_TIMING_DMACK_HOLD | ATA_TIMING_CYCLE | ATA_TIMING_UDMA, /* ACPI constants */ ATA_ACPI_FILTER_SETXFER = 1 << 0, ATA_ACPI_FILTER_LOCK = 1 << 1, ATA_ACPI_FILTER_DIPM = 1 << 2, ATA_ACPI_FILTER_FPDMA_OFFSET = 1 << 3, /* FPDMA non-zero offset */ ATA_ACPI_FILTER_FPDMA_AA = 1 << 4, /* FPDMA auto activate */ ATA_ACPI_FILTER_DEFAULT = ATA_ACPI_FILTER_SETXFER | ATA_ACPI_FILTER_LOCK | ATA_ACPI_FILTER_DIPM, }; enum ata_xfer_mask { ATA_MASK_PIO = ((1LU << ATA_NR_PIO_MODES) - 1) << ATA_SHIFT_PIO, ATA_MASK_MWDMA = ((1LU << ATA_NR_MWDMA_MODES) - 1) << ATA_SHIFT_MWDMA, ATA_MASK_UDMA = ((1LU << ATA_NR_UDMA_MODES) - 1) << ATA_SHIFT_UDMA, }; enum hsm_task_states { HSM_ST_IDLE, /* no command on going */ HSM_ST_FIRST, /* (waiting the device to) write CDB or first data block */ HSM_ST, /* (waiting the device to) transfer data */ HSM_ST_LAST, /* 
(waiting the device to) complete command */ HSM_ST_ERR, /* error */ }; enum ata_completion_errors { AC_ERR_OK = 0, /* no error */ AC_ERR_DEV = (1 << 0), /* device reported error */ AC_ERR_HSM = (1 << 1), /* host state machine violation */ AC_ERR_TIMEOUT = (1 << 2), /* timeout */ AC_ERR_MEDIA = (1 << 3), /* media error */ AC_ERR_ATA_BUS = (1 << 4), /* ATA bus error */ AC_ERR_HOST_BUS = (1 << 5), /* host bus error */ AC_ERR_SYSTEM = (1 << 6), /* system error */ AC_ERR_INVALID = (1 << 7), /* invalid argument */ AC_ERR_OTHER = (1 << 8), /* unknown */ AC_ERR_NODEV_HINT = (1 << 9), /* polling device detection hint */ AC_ERR_NCQ = (1 << 10), /* marker for offending NCQ qc */ }; /* * Link power management policy: If you alter this, you also need to * alter libata-scsi.c (for the ascii descriptions) */ enum ata_lpm_policy { ATA_LPM_UNKNOWN, ATA_LPM_MAX_POWER, ATA_LPM_MED_POWER, ATA_LPM_MED_POWER_WITH_DIPM, /* Med power + DIPM as win IRST does */ ATA_LPM_MIN_POWER_WITH_PARTIAL, /* Min Power + partial and slumber */ ATA_LPM_MIN_POWER, /* Min power + no partial (slumber only) */ }; enum ata_lpm_hints { ATA_LPM_EMPTY = (1 << 0), /* port empty/probing */ ATA_LPM_HIPM = (1 << 1), /* may use HIPM */ ATA_LPM_WAKE_ONLY = (1 << 2), /* only wake up link */ }; /* forward declarations */ struct scsi_device; struct ata_port_operations; struct ata_port; struct ata_link; struct ata_queued_cmd; /* typedefs */ typedef void (*ata_qc_cb_t) (struct ata_queued_cmd *qc); typedef int (*ata_prereset_fn_t)(struct ata_link *link, unsigned long deadline); typedef int (*ata_reset_fn_t)(struct ata_link *link, unsigned int *classes, unsigned long deadline); typedef void (*ata_postreset_fn_t)(struct ata_link *link, unsigned int *classes); extern struct device_attribute dev_attr_unload_heads; #ifdef CONFIG_SATA_HOST extern struct device_attribute dev_attr_link_power_management_policy; extern struct device_attribute dev_attr_ncq_prio_enable; extern struct device_attribute dev_attr_em_message_type; extern struct device_attribute dev_attr_em_message; extern struct device_attribute dev_attr_sw_activity; #endif enum sw_activity { OFF, BLINK_ON, BLINK_OFF, }; struct ata_taskfile { unsigned long flags; /* ATA_TFLAG_xxx */ u8 protocol; /* ATA_PROT_xxx */ u8 ctl; /* control reg */ u8 hob_feature; /* additional data */ u8 hob_nsect; /* to support LBA48 */ u8 hob_lbal; u8 hob_lbam; u8 hob_lbah; u8 feature; u8 nsect; u8 lbal; u8 lbam; u8 lbah; u8 device; u8 command; /* IO operation */ u32 auxiliary; /* auxiliary field */ /* from SATA 3.1 and */ /* ATA-8 ACS-3 */ }; #ifdef CONFIG_ATA_SFF struct ata_ioports { void __iomem *cmd_addr; void __iomem *data_addr; void __iomem *error_addr; void __iomem *feature_addr; void __iomem *nsect_addr; void __iomem *lbal_addr; void __iomem *lbam_addr; void __iomem *lbah_addr; void __iomem *device_addr; void __iomem *status_addr; void __iomem *command_addr; void __iomem *altstatus_addr; void __iomem *ctl_addr; #ifdef CONFIG_ATA_BMDMA void __iomem *bmdma_addr; #endif /* CONFIG_ATA_BMDMA */ void __iomem *scr_addr; }; #endif /* CONFIG_ATA_SFF */ struct ata_host { spinlock_t lock; struct device *dev; void __iomem * const *iomap; unsigned int n_ports; unsigned int n_tags; /* nr of NCQ tags */ void *private_data; struct ata_port_operations *ops; unsigned long flags; struct kref kref; struct mutex eh_mutex; struct task_struct *eh_owner; struct ata_port *simplex_claimed; /* channel owning the DMA */ struct ata_port *ports[]; }; struct ata_queued_cmd { struct ata_port *ap; struct ata_device *dev; struct scsi_cmnd 
*scsicmd; void (*scsidone)(struct scsi_cmnd *); struct ata_taskfile tf; u8 cdb[ATAPI_CDB_LEN]; unsigned long flags; /* ATA_QCFLAG_xxx */ unsigned int tag; /* libata core tag */ unsigned int hw_tag; /* driver tag */ unsigned int n_elem; unsigned int orig_n_elem; int dma_dir; unsigned int sect_size; unsigned int nbytes; unsigned int extrabytes; unsigned int curbytes; struct scatterlist sgent; struct scatterlist *sg; struct scatterlist *cursg; unsigned int cursg_ofs; unsigned int err_mask; struct ata_taskfile result_tf; ata_qc_cb_t complete_fn; void *private_data; void *lldd_task; }; struct ata_port_stats { unsigned long unhandled_irq; unsigned long idle_irq; unsigned long rw_reqbuf; }; struct ata_ering_entry { unsigned int eflags; unsigned int err_mask; u64 timestamp; }; struct ata_ering { int cursor; struct ata_ering_entry ring[ATA_ERING_SIZE]; }; struct ata_device { struct ata_link *link; unsigned int devno; /* 0 or 1 */ unsigned int horkage; /* List of broken features */ unsigned long flags; /* ATA_DFLAG_xxx */ struct scsi_device *sdev; /* attached SCSI device */ void *private_data; #ifdef CONFIG_ATA_ACPI union acpi_object *gtf_cache; unsigned int gtf_filter; #endif #ifdef CONFIG_SATA_ZPODD void *zpodd; #endif struct device tdev; /* n_sector is CLEAR_BEGIN, read comment above CLEAR_BEGIN */ u64 n_sectors; /* size of device, if ATA */ u64 n_native_sectors; /* native size, if ATA */ unsigned int class; /* ATA_DEV_xxx */ unsigned long unpark_deadline; u8 pio_mode; u8 dma_mode; u8 xfer_mode; unsigned int xfer_shift; /* ATA_SHIFT_xxx */ unsigned int multi_count; /* sectors count for READ/WRITE MULTIPLE */ unsigned int max_sectors; /* per-device max sectors */ unsigned int cdb_len; /* per-dev xfer mask */ unsigned long pio_mask; unsigned long mwdma_mask; unsigned long udma_mask; /* for CHS addressing */ u16 cylinders; /* Number of cylinders */ u16 heads; /* Number of heads */ u16 sectors; /* Number of sectors per track */ union { u16 id[ATA_ID_WORDS]; /* IDENTIFY xxx DEVICE data */ u32 gscr[SATA_PMP_GSCR_DWORDS]; /* PMP GSCR block */ } ____cacheline_aligned; /* DEVSLP Timing Variables from Identify Device Data Log */ u8 devslp_timing[ATA_LOG_DEVSLP_SIZE]; /* NCQ send and receive log subcommand support */ u8 ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_SIZE]; u8 ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_SIZE]; /* ZAC zone configuration */ u32 zac_zoned_cap; u32 zac_zones_optimal_open; u32 zac_zones_optimal_nonseq; u32 zac_zones_max_open; /* error history */ int spdn_cnt; /* ering is CLEAR_END, read comment above CLEAR_END */ struct ata_ering ering; }; /* Fields between ATA_DEVICE_CLEAR_BEGIN and ATA_DEVICE_CLEAR_END are * cleared to zero on ata_dev_init(). 
*/ #define ATA_DEVICE_CLEAR_BEGIN offsetof(struct ata_device, n_sectors) #define ATA_DEVICE_CLEAR_END offsetof(struct ata_device, ering) struct ata_eh_info { struct ata_device *dev; /* offending device */ u32 serror; /* SError from LLDD */ unsigned int err_mask; /* port-wide err_mask */ unsigned int action; /* ATA_EH_* action mask */ unsigned int dev_action[ATA_MAX_DEVICES]; /* dev EH action */ unsigned int flags; /* ATA_EHI_* flags */ unsigned int probe_mask; char desc[ATA_EH_DESC_LEN]; int desc_len; }; struct ata_eh_context { struct ata_eh_info i; int tries[ATA_MAX_DEVICES]; int cmd_timeout_idx[ATA_MAX_DEVICES] [ATA_EH_CMD_TIMEOUT_TABLE_SIZE]; unsigned int classes[ATA_MAX_DEVICES]; unsigned int did_probe_mask; unsigned int unloaded_mask; unsigned int saved_ncq_enabled; u8 saved_xfer_mode[ATA_MAX_DEVICES]; /* timestamp for the last reset attempt or success */ unsigned long last_reset; }; struct ata_acpi_drive { u32 pio; u32 dma; } __packed; struct ata_acpi_gtm { struct ata_acpi_drive drive[2]; u32 flags; } __packed; struct ata_link { struct ata_port *ap; int pmp; /* port multiplier port # */ struct device tdev; unsigned int active_tag; /* active tag on this link */ u32 sactive; /* active NCQ commands */ unsigned int flags; /* ATA_LFLAG_xxx */ u32 saved_scontrol; /* SControl on probe */ unsigned int hw_sata_spd_limit; unsigned int sata_spd_limit; unsigned int sata_spd; /* current SATA PHY speed */ enum ata_lpm_policy lpm_policy; /* record runtime error info, protected by host_set lock */ struct ata_eh_info eh_info; /* EH context */ struct ata_eh_context eh_context; struct ata_device device[ATA_MAX_DEVICES]; unsigned long last_lpm_change; /* when last LPM change happened */ }; #define ATA_LINK_CLEAR_BEGIN offsetof(struct ata_link, active_tag) #define ATA_LINK_CLEAR_END offsetof(struct ata_link, device[0]) struct ata_port { struct Scsi_Host *scsi_host; /* our co-allocated scsi host */ struct ata_port_operations *ops; spinlock_t *lock; /* Flags owned by the EH context. Only EH should touch these once the port is active */ unsigned long flags; /* ATA_FLAG_xxx */ /* Flags that change dynamically, protected by ap->lock */ unsigned int pflags; /* ATA_PFLAG_xxx */ unsigned int print_id; /* user visible unique port ID */ unsigned int local_port_no; /* host local port num */ unsigned int port_no; /* 0 based port no. 
inside the host */ #ifdef CONFIG_ATA_SFF struct ata_ioports ioaddr; /* ATA cmd/ctl/dma register blocks */ u8 ctl; /* cache of ATA control register */ u8 last_ctl; /* Cache last written value */ struct ata_link* sff_pio_task_link; /* link currently used */ struct delayed_work sff_pio_task; #ifdef CONFIG_ATA_BMDMA struct ata_bmdma_prd *bmdma_prd; /* BMDMA SG list */ dma_addr_t bmdma_prd_dma; /* and its DMA mapping */ #endif /* CONFIG_ATA_BMDMA */ #endif /* CONFIG_ATA_SFF */ unsigned int pio_mask; unsigned int mwdma_mask; unsigned int udma_mask; unsigned int cbl; /* cable type; ATA_CBL_xxx */ struct ata_queued_cmd qcmd[ATA_MAX_QUEUE + 1]; unsigned long sas_tag_allocated; /* for sas tag allocation only */ u64 qc_active; int nr_active_links; /* #links with active qcs */ unsigned int sas_last_tag; /* track next tag hw expects */ struct ata_link link; /* host default link */ struct ata_link *slave_link; /* see ata_slave_link_init() */ int nr_pmp_links; /* nr of available PMP links */ struct ata_link *pmp_link; /* array of PMP links */ struct ata_link *excl_link; /* for PMP qc exclusion */ struct ata_port_stats stats; struct ata_host *host; struct device *dev; struct device tdev; struct mutex scsi_scan_mutex; struct delayed_work hotplug_task; struct work_struct scsi_rescan_task; unsigned int hsm_task_state; u32 msg_enable; struct list_head eh_done_q; wait_queue_head_t eh_wait_q; int eh_tries; struct completion park_req_pending; pm_message_t pm_mesg; enum ata_lpm_policy target_lpm_policy; struct timer_list fastdrain_timer; unsigned long fastdrain_cnt; async_cookie_t cookie; int em_message_type; void *private_data; #ifdef CONFIG_ATA_ACPI struct ata_acpi_gtm __acpi_init_gtm; /* use ata_acpi_init_gtm() */ #endif /* owned by EH */ u8 sector_buf[ATA_SECT_SIZE] ____cacheline_aligned; }; /* The following initializer overrides a method to NULL whether one of * its parent has the method defined or not. This is equivalent to * ERR_PTR(-ENOENT). Unfortunately, ERR_PTR doesn't render a constant * expression and thus can't be used as an initializer. 
*/ #define ATA_OP_NULL (void *)(unsigned long)(-ENOENT) struct ata_port_operations { /* * Command execution */ int (*qc_defer)(struct ata_queued_cmd *qc); int (*check_atapi_dma)(struct ata_queued_cmd *qc); enum ata_completion_errors (*qc_prep)(struct ata_queued_cmd *qc); unsigned int (*qc_issue)(struct ata_queued_cmd *qc); bool (*qc_fill_rtf)(struct ata_queued_cmd *qc); /* * Configuration and exception handling */ int (*cable_detect)(struct ata_port *ap); unsigned long (*mode_filter)(struct ata_device *dev, unsigned long xfer_mask); void (*set_piomode)(struct ata_port *ap, struct ata_device *dev); void (*set_dmamode)(struct ata_port *ap, struct ata_device *dev); int (*set_mode)(struct ata_link *link, struct ata_device **r_failed_dev); unsigned int (*read_id)(struct ata_device *dev, struct ata_taskfile *tf, u16 *id); void (*dev_config)(struct ata_device *dev); void (*freeze)(struct ata_port *ap); void (*thaw)(struct ata_port *ap); ata_prereset_fn_t prereset; ata_reset_fn_t softreset; ata_reset_fn_t hardreset; ata_postreset_fn_t postreset; ata_prereset_fn_t pmp_prereset; ata_reset_fn_t pmp_softreset; ata_reset_fn_t pmp_hardreset; ata_postreset_fn_t pmp_postreset; void (*error_handler)(struct ata_port *ap); void (*lost_interrupt)(struct ata_port *ap); void (*post_internal_cmd)(struct ata_queued_cmd *qc); void (*sched_eh)(struct ata_port *ap); void (*end_eh)(struct ata_port *ap); /* * Optional features */ int (*scr_read)(struct ata_link *link, unsigned int sc_reg, u32 *val); int (*scr_write)(struct ata_link *link, unsigned int sc_reg, u32 val); void (*pmp_attach)(struct ata_port *ap); void (*pmp_detach)(struct ata_port *ap); int (*set_lpm)(struct ata_link *link, enum ata_lpm_policy policy, unsigned hints); /* * Start, stop, suspend and resume */ int (*port_suspend)(struct ata_port *ap, pm_message_t mesg); int (*port_resume)(struct ata_port *ap); int (*port_start)(struct ata_port *ap); void (*port_stop)(struct ata_port *ap); void (*host_stop)(struct ata_host *host); #ifdef CONFIG_ATA_SFF /* * SFF / taskfile oriented ops */ void (*sff_dev_select)(struct ata_port *ap, unsigned int device); void (*sff_set_devctl)(struct ata_port *ap, u8 ctl); u8 (*sff_check_status)(struct ata_port *ap); u8 (*sff_check_altstatus)(struct ata_port *ap); void (*sff_tf_load)(struct ata_port *ap, const struct ata_taskfile *tf); void (*sff_tf_read)(struct ata_port *ap, struct ata_taskfile *tf); void (*sff_exec_command)(struct ata_port *ap, const struct ata_taskfile *tf); unsigned int (*sff_data_xfer)(struct ata_queued_cmd *qc, unsigned char *buf, unsigned int buflen, int rw); void (*sff_irq_on)(struct ata_port *); bool (*sff_irq_check)(struct ata_port *); void (*sff_irq_clear)(struct ata_port *); void (*sff_drain_fifo)(struct ata_queued_cmd *qc); #ifdef CONFIG_ATA_BMDMA void (*bmdma_setup)(struct ata_queued_cmd *qc); void (*bmdma_start)(struct ata_queued_cmd *qc); void (*bmdma_stop)(struct ata_queued_cmd *qc); u8 (*bmdma_status)(struct ata_port *ap); #endif /* CONFIG_ATA_BMDMA */ #endif /* CONFIG_ATA_SFF */ ssize_t (*em_show)(struct ata_port *ap, char *buf); ssize_t (*em_store)(struct ata_port *ap, const char *message, size_t size); ssize_t (*sw_activity_show)(struct ata_device *dev, char *buf); ssize_t (*sw_activity_store)(struct ata_device *dev, enum sw_activity val); ssize_t (*transmit_led_message)(struct ata_port *ap, u32 state, ssize_t size); /* * Obsolete */ void (*phy_reset)(struct ata_port *ap); void (*eng_timeout)(struct ata_port *ap); /* * ->inherits must be the last field and all the preceding * fields must be 
pointers. */ const struct ata_port_operations *inherits; }; struct ata_port_info { unsigned long flags; unsigned long link_flags; unsigned long pio_mask; unsigned long mwdma_mask; unsigned long udma_mask; struct ata_port_operations *port_ops; void *private_data; }; struct ata_timing { unsigned short mode; /* ATA mode */ unsigned short setup; /* t1 */ unsigned short act8b; /* t2 for 8-bit I/O */ unsigned short rec8b; /* t2i for 8-bit I/O */ unsigned short cyc8b; /* t0 for 8-bit I/O */ unsigned short active; /* t2 or tD */ unsigned short recover; /* t2i or tK */ unsigned short dmack_hold; /* tj */ unsigned short cycle; /* t0 */ unsigned short udma; /* t2CYCTYP/2 */ }; /* * Core layer - drivers/ata/libata-core.c */ extern struct ata_port_operations ata_dummy_port_ops; extern const struct ata_port_info ata_dummy_port_info; static inline bool ata_is_atapi(u8 prot) { return prot & ATA_PROT_FLAG_ATAPI; } static inline bool ata_is_pio(u8 prot) { return prot & ATA_PROT_FLAG_PIO; } static inline bool ata_is_dma(u8 prot) { return prot & ATA_PROT_FLAG_DMA; } static inline bool ata_is_ncq(u8 prot) { return prot & ATA_PROT_FLAG_NCQ; } static inline bool ata_is_data(u8 prot) { return prot & (ATA_PROT_FLAG_PIO | ATA_PROT_FLAG_DMA); } static inline int is_multi_taskfile(struct ata_taskfile *tf) { return (tf->command == ATA_CMD_READ_MULTI) || (tf->command == ATA_CMD_WRITE_MULTI) || (tf->command == ATA_CMD_READ_MULTI_EXT) || (tf->command == ATA_CMD_WRITE_MULTI_EXT) || (tf->command == ATA_CMD_WRITE_MULTI_FUA_EXT); } static inline int ata_port_is_dummy(struct ata_port *ap) { return ap->ops == &ata_dummy_port_ops; } extern int ata_std_prereset(struct ata_link *link, unsigned long deadline); extern int ata_wait_after_reset(struct ata_link *link, unsigned long deadline, int (*check_ready)(struct ata_link *link)); extern int sata_std_hardreset(struct ata_link *link, unsigned int *class, unsigned long deadline); extern void ata_std_postreset(struct ata_link *link, unsigned int *classes); extern struct ata_host *ata_host_alloc(struct device *dev, int max_ports); extern struct ata_host *ata_host_alloc_pinfo(struct device *dev, const struct ata_port_info * const * ppi, int n_ports); extern void ata_host_get(struct ata_host *host); extern void ata_host_put(struct ata_host *host); extern int ata_host_start(struct ata_host *host); extern int ata_host_register(struct ata_host *host, struct scsi_host_template *sht); extern int ata_host_activate(struct ata_host *host, int irq, irq_handler_t irq_handler, unsigned long irq_flags, struct scsi_host_template *sht); extern void ata_host_detach(struct ata_host *host); extern void ata_host_init(struct ata_host *, struct device *, struct ata_port_operations *); extern int ata_scsi_detect(struct scsi_host_template *sht); extern int ata_scsi_ioctl(struct scsi_device *dev, unsigned int cmd, void __user *arg); #ifdef CONFIG_COMPAT #define ATA_SCSI_COMPAT_IOCTL .compat_ioctl = ata_scsi_ioctl, #else #define ATA_SCSI_COMPAT_IOCTL /* empty */ #endif extern int ata_scsi_queuecmd(struct Scsi_Host *h, struct scsi_cmnd *cmd); #if IS_REACHABLE(CONFIG_ATA) bool ata_scsi_dma_need_drain(struct request *rq); #else #define ata_scsi_dma_need_drain NULL #endif extern int ata_sas_scsi_ioctl(struct ata_port *ap, struct scsi_device *dev, unsigned int cmd, void __user *arg); extern bool ata_link_online(struct ata_link *link); extern bool ata_link_offline(struct ata_link *link); #ifdef CONFIG_PM extern int ata_host_suspend(struct ata_host *host, pm_message_t mesg); extern void ata_host_resume(struct 
ata_host *host); extern void ata_sas_port_suspend(struct ata_port *ap); extern void ata_sas_port_resume(struct ata_port *ap); #else static inline void ata_sas_port_suspend(struct ata_port *ap) { } static inline void ata_sas_port_resume(struct ata_port *ap) { } #endif extern int ata_ratelimit(void); extern void ata_msleep(struct ata_port *ap, unsigned int msecs); extern u32 ata_wait_register(struct ata_port *ap, void __iomem *reg, u32 mask, u32 val, unsigned long interval, unsigned long timeout); extern int atapi_cmd_type(u8 opcode); extern unsigned long ata_pack_xfermask(unsigned long pio_mask, unsigned long mwdma_mask, unsigned long udma_mask); extern void ata_unpack_xfermask(unsigned long xfer_mask, unsigned long *pio_mask, unsigned long *mwdma_mask, unsigned long *udma_mask); extern u8 ata_xfer_mask2mode(unsigned long xfer_mask); extern unsigned long ata_xfer_mode2mask(u8 xfer_mode); extern int ata_xfer_mode2shift(unsigned long xfer_mode); extern const char *ata_mode_string(unsigned long xfer_mask); extern unsigned long ata_id_xfermask(const u16 *id); extern int ata_std_qc_defer(struct ata_queued_cmd *qc); extern enum ata_completion_errors ata_noop_qc_prep(struct ata_queued_cmd *qc); extern void ata_sg_init(struct ata_queued_cmd *qc, struct scatterlist *sg, unsigned int n_elem); extern unsigned int ata_dev_classify(const struct ata_taskfile *tf); extern void ata_dev_disable(struct ata_device *adev); extern void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs, unsigned int len); extern void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs, unsigned int len); extern unsigned int ata_do_dev_read_id(struct ata_device *dev, struct ata_taskfile *tf, u16 *id); extern void ata_qc_complete(struct ata_queued_cmd *qc); extern u64 ata_qc_get_active(struct ata_port *ap); extern void ata_scsi_simulate(struct ata_device *dev, struct scsi_cmnd *cmd); extern int ata_std_bios_param(struct scsi_device *sdev, struct block_device *bdev, sector_t capacity, int geom[]); extern void ata_scsi_unlock_native_capacity(struct scsi_device *sdev); extern int ata_scsi_slave_config(struct scsi_device *sdev); extern void ata_scsi_slave_destroy(struct scsi_device *sdev); extern int ata_scsi_change_queue_depth(struct scsi_device *sdev, int queue_depth); extern int __ata_change_queue_depth(struct ata_port *ap, struct scsi_device *sdev, int queue_depth); extern struct ata_device *ata_dev_pair(struct ata_device *adev); extern int ata_do_set_mode(struct ata_link *link, struct ata_device **r_failed_dev); extern void ata_scsi_port_error_handler(struct Scsi_Host *host, struct ata_port *ap); extern void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap, struct list_head *eh_q); /* * SATA specific code - drivers/ata/libata-sata.c */ #ifdef CONFIG_SATA_HOST extern const unsigned long sata_deb_timing_normal[]; extern const unsigned long sata_deb_timing_hotplug[]; extern const unsigned long sata_deb_timing_long[]; static inline const unsigned long * sata_ehc_deb_timing(struct ata_eh_context *ehc) { if (ehc->i.flags & ATA_EHI_HOTPLUGGED) return sata_deb_timing_hotplug; else return sata_deb_timing_normal; } extern int sata_scr_valid(struct ata_link *link); extern int sata_scr_read(struct ata_link *link, int reg, u32 *val); extern int sata_scr_write(struct ata_link *link, int reg, u32 val); extern int sata_scr_write_flush(struct ata_link *link, int reg, u32 val); extern int sata_set_spd(struct ata_link *link); extern int sata_link_hardreset(struct ata_link *link, const unsigned long 
*timing, unsigned long deadline, bool *online, int (*check_ready)(struct ata_link *)); extern int sata_link_resume(struct ata_link *link, const unsigned long *params, unsigned long deadline); extern void ata_eh_analyze_ncq_error(struct ata_link *link); #else static inline const unsigned long * sata_ehc_deb_timing(struct ata_eh_context *ehc) { return NULL; } static inline int sata_scr_valid(struct ata_link *link) { return 0; } static inline int sata_scr_read(struct ata_link *link, int reg, u32 *val) { return -EOPNOTSUPP; } static inline int sata_scr_write(struct ata_link *link, int reg, u32 val) { return -EOPNOTSUPP; } static inline int sata_scr_write_flush(struct ata_link *link, int reg, u32 val) { return -EOPNOTSUPP; } static inline int sata_set_spd(struct ata_link *link) { return -EOPNOTSUPP; } static inline int sata_link_hardreset(struct ata_link *link, const unsigned long *timing, unsigned long deadline, bool *online, int (*check_ready)(struct ata_link *)) { if (online) *online = false; return -EOPNOTSUPP; } static inline int sata_link_resume(struct ata_link *link, const unsigned long *params, unsigned long deadline) { return -EOPNOTSUPP; } static inline void ata_eh_analyze_ncq_error(struct ata_link *link) { } #endif extern int sata_link_debounce(struct ata_link *link, const unsigned long *params, unsigned long deadline); extern int sata_link_scr_lpm(struct ata_link *link, enum ata_lpm_policy policy, bool spm_wakeup); extern int ata_slave_link_init(struct ata_port *ap); extern void ata_sas_port_destroy(struct ata_port *); extern struct ata_port *ata_sas_port_alloc(struct ata_host *, struct ata_port_info *, struct Scsi_Host *); extern void ata_sas_async_probe(struct ata_port *ap); extern int ata_sas_sync_probe(struct ata_port *ap); extern int ata_sas_port_init(struct ata_port *); extern int ata_sas_port_start(struct ata_port *ap); extern int ata_sas_tport_add(struct device *parent, struct ata_port *ap); extern void ata_sas_tport_delete(struct ata_port *ap); extern void ata_sas_port_stop(struct ata_port *ap); extern int ata_sas_slave_configure(struct scsi_device *, struct ata_port *); extern int ata_sas_queuecmd(struct scsi_cmnd *cmd, struct ata_port *ap); extern void ata_tf_to_fis(const struct ata_taskfile *tf, u8 pmp, int is_cmd, u8 *fis); extern void ata_tf_from_fis(const u8 *fis, struct ata_taskfile *tf); extern int ata_qc_complete_multiple(struct ata_port *ap, u64 qc_active); extern bool sata_lpm_ignore_phy_events(struct ata_link *link); extern int sata_async_notification(struct ata_port *ap); extern int ata_cable_40wire(struct ata_port *ap); extern int ata_cable_80wire(struct ata_port *ap); extern int ata_cable_sata(struct ata_port *ap); extern int ata_cable_ignore(struct ata_port *ap); extern int ata_cable_unknown(struct ata_port *ap); /* Timing helpers */ extern unsigned int ata_pio_need_iordy(const struct ata_device *); extern u8 ata_timing_cycle2mode(unsigned int xfer_shift, int cycle); /* PCI */ #ifdef CONFIG_PCI struct pci_dev; struct pci_bits { unsigned int reg; /* PCI config register to read */ unsigned int width; /* 1 (8 bit), 2 (16 bit), 4 (32 bit) */ unsigned long mask; unsigned long val; }; extern int pci_test_config_bits(struct pci_dev *pdev, const struct pci_bits *bits); extern void ata_pci_shutdown_one(struct pci_dev *pdev); extern void ata_pci_remove_one(struct pci_dev *pdev); #ifdef CONFIG_PM extern void ata_pci_device_do_suspend(struct pci_dev *pdev, pm_message_t mesg); extern int __must_check ata_pci_device_do_resume(struct pci_dev *pdev); extern int 
ata_pci_device_suspend(struct pci_dev *pdev, pm_message_t mesg); extern int ata_pci_device_resume(struct pci_dev *pdev); #endif /* CONFIG_PM */ #endif /* CONFIG_PCI */ struct platform_device; extern int ata_platform_remove_one(struct platform_device *pdev); /* * ACPI - drivers/ata/libata-acpi.c */ #ifdef CONFIG_ATA_ACPI static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap) { if (ap->pflags & ATA_PFLAG_INIT_GTM_VALID) return &ap->__acpi_init_gtm; return NULL; } int ata_acpi_stm(struct ata_port *ap, const struct ata_acpi_gtm *stm); int ata_acpi_gtm(struct ata_port *ap, struct ata_acpi_gtm *stm); unsigned long ata_acpi_gtm_xfermask(struct ata_device *dev, const struct ata_acpi_gtm *gtm); int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm); #else static inline const struct ata_acpi_gtm *ata_acpi_init_gtm(struct ata_port *ap) { return NULL; } static inline int ata_acpi_stm(const struct ata_port *ap, struct ata_acpi_gtm *stm) { return -ENOSYS; } static inline int ata_acpi_gtm(const struct ata_port *ap, struct ata_acpi_gtm *stm) { return -ENOSYS; } static inline unsigned int ata_acpi_gtm_xfermask(struct ata_device *dev, const struct ata_acpi_gtm *gtm) { return 0; } static inline int ata_acpi_cbl_80wire(struct ata_port *ap, const struct ata_acpi_gtm *gtm) { return 0; } #endif /* * EH - drivers/ata/libata-eh.c */ extern void ata_port_schedule_eh(struct ata_port *ap); extern void ata_port_wait_eh(struct ata_port *ap); extern int ata_link_abort(struct ata_link *link); extern int ata_port_abort(struct ata_port *ap); extern int ata_port_freeze(struct ata_port *ap); extern void ata_eh_freeze_port(struct ata_port *ap); extern void ata_eh_thaw_port(struct ata_port *ap); extern void ata_eh_qc_complete(struct ata_queued_cmd *qc); extern void ata_eh_qc_retry(struct ata_queued_cmd *qc); extern void ata_do_eh(struct ata_port *ap, ata_prereset_fn_t prereset, ata_reset_fn_t softreset, ata_reset_fn_t hardreset, ata_postreset_fn_t postreset); extern void ata_std_error_handler(struct ata_port *ap); extern void ata_std_sched_eh(struct ata_port *ap); extern void ata_std_end_eh(struct ata_port *ap); extern int ata_link_nr_enabled(struct ata_link *link); /* * Base operations to inherit from and initializers for sht * * Operations * * base : Common to all libata drivers. * sata : SATA controllers w/ native interface. * pmp : SATA controllers w/ PMP support. * sff : SFF ATA controllers w/o BMDMA support. * bmdma : SFF ATA controllers w/ BMDMA support. * * sht initializers * * BASE : Common to all libata drivers. The user must set * sg_tablesize and dma_boundary. * PIO : SFF ATA controllers w/ only PIO support. * BMDMA : SFF ATA controllers w/ BMDMA support. sg_tablesize and * dma_boundary are set to BMDMA limits. * NCQ : SATA controllers supporting NCQ. The user must set * sg_tablesize, dma_boundary and can_queue. */ extern const struct ata_port_operations ata_base_port_ops; extern const struct ata_port_operations sata_port_ops; extern struct device_attribute *ata_common_sdev_attrs[]; /* * All sht initializers (BASE, PIO, BMDMA, NCQ) must be instantiated * by the edge drivers. Because the 'module' field of sht must be the * edge driver's module reference, otherwise the driver can be unloaded * even if the scsi_device is being accessed. 
*/ #define __ATA_BASE_SHT(drv_name) \ .module = THIS_MODULE, \ .name = drv_name, \ .ioctl = ata_scsi_ioctl, \ ATA_SCSI_COMPAT_IOCTL \ .queuecommand = ata_scsi_queuecmd, \ .dma_need_drain = ata_scsi_dma_need_drain, \ .can_queue = ATA_DEF_QUEUE, \ .tag_alloc_policy = BLK_TAG_ALLOC_RR, \ .this_id = ATA_SHT_THIS_ID, \ .emulated = ATA_SHT_EMULATED, \ .proc_name = drv_name, \ .slave_configure = ata_scsi_slave_config, \ .slave_destroy = ata_scsi_slave_destroy, \ .bios_param = ata_std_bios_param, \ .unlock_native_capacity = ata_scsi_unlock_native_capacity #define ATA_BASE_SHT(drv_name) \ __ATA_BASE_SHT(drv_name), \ .sdev_attrs = ata_common_sdev_attrs #ifdef CONFIG_SATA_HOST extern struct device_attribute *ata_ncq_sdev_attrs[]; #define ATA_NCQ_SHT(drv_name) \ __ATA_BASE_SHT(drv_name), \ .sdev_attrs = ata_ncq_sdev_attrs, \ .change_queue_depth = ata_scsi_change_queue_depth #endif /* * PMP helpers */ #ifdef CONFIG_SATA_PMP static inline bool sata_pmp_supported(struct ata_port *ap) { return ap->flags & ATA_FLAG_PMP; } static inline bool sata_pmp_attached(struct ata_port *ap) { return ap->nr_pmp_links != 0; } static inline bool ata_is_host_link(const struct ata_link *link) { return link == &link->ap->link || link == link->ap->slave_link; } #else /* CONFIG_SATA_PMP */ static inline bool sata_pmp_supported(struct ata_port *ap) { return false; } static inline bool sata_pmp_attached(struct ata_port *ap) { return false; } static inline bool ata_is_host_link(const struct ata_link *link) { return 1; } #endif /* CONFIG_SATA_PMP */ static inline int sata_srst_pmp(struct ata_link *link) { if (sata_pmp_supported(link->ap) && ata_is_host_link(link)) return SATA_PMP_CTRL_PORT; return link->pmp; } /* * printk helpers */ __printf(3, 4) void ata_port_printk(const struct ata_port *ap, const char *level, const char *fmt, ...); __printf(3, 4) void ata_link_printk(const struct ata_link *link, const char *level, const char *fmt, ...); __printf(3, 4) void ata_dev_printk(const struct ata_device *dev, const char *level, const char *fmt, ...); #define ata_port_err(ap, fmt, ...) \ ata_port_printk(ap, KERN_ERR, fmt, ##__VA_ARGS__) #define ata_port_warn(ap, fmt, ...) \ ata_port_printk(ap, KERN_WARNING, fmt, ##__VA_ARGS__) #define ata_port_notice(ap, fmt, ...) \ ata_port_printk(ap, KERN_NOTICE, fmt, ##__VA_ARGS__) #define ata_port_info(ap, fmt, ...) \ ata_port_printk(ap, KERN_INFO, fmt, ##__VA_ARGS__) #define ata_port_dbg(ap, fmt, ...) \ ata_port_printk(ap, KERN_DEBUG, fmt, ##__VA_ARGS__) #define ata_link_err(link, fmt, ...) \ ata_link_printk(link, KERN_ERR, fmt, ##__VA_ARGS__) #define ata_link_warn(link, fmt, ...) \ ata_link_printk(link, KERN_WARNING, fmt, ##__VA_ARGS__) #define ata_link_notice(link, fmt, ...) \ ata_link_printk(link, KERN_NOTICE, fmt, ##__VA_ARGS__) #define ata_link_info(link, fmt, ...) \ ata_link_printk(link, KERN_INFO, fmt, ##__VA_ARGS__) #define ata_link_dbg(link, fmt, ...) \ ata_link_printk(link, KERN_DEBUG, fmt, ##__VA_ARGS__) #define ata_dev_err(dev, fmt, ...) \ ata_dev_printk(dev, KERN_ERR, fmt, ##__VA_ARGS__) #define ata_dev_warn(dev, fmt, ...) \ ata_dev_printk(dev, KERN_WARNING, fmt, ##__VA_ARGS__) #define ata_dev_notice(dev, fmt, ...) \ ata_dev_printk(dev, KERN_NOTICE, fmt, ##__VA_ARGS__) #define ata_dev_info(dev, fmt, ...) \ ata_dev_printk(dev, KERN_INFO, fmt, ##__VA_ARGS__) #define ata_dev_dbg(dev, fmt, ...) 
\ ata_dev_printk(dev, KERN_DEBUG, fmt, ##__VA_ARGS__) void ata_print_version(const struct device *dev, const char *version); /* * ata_eh_info helpers */ extern __printf(2, 3) void __ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...); extern __printf(2, 3) void ata_ehi_push_desc(struct ata_eh_info *ehi, const char *fmt, ...); extern void ata_ehi_clear_desc(struct ata_eh_info *ehi); static inline void ata_ehi_hotplugged(struct ata_eh_info *ehi) { ehi->probe_mask |= (1 << ATA_MAX_DEVICES) - 1; ehi->flags |= ATA_EHI_HOTPLUGGED; ehi->action |= ATA_EH_RESET | ATA_EH_ENABLE_LINK; ehi->err_mask |= AC_ERR_ATA_BUS; } /* * port description helpers */ extern __printf(2, 3) void ata_port_desc(struct ata_port *ap, const char *fmt, ...); #ifdef CONFIG_PCI extern void ata_port_pbar_desc(struct ata_port *ap, int bar, ssize_t offset, const char *name); #endif static inline bool ata_tag_internal(unsigned int tag) { return tag == ATA_TAG_INTERNAL; } static inline bool ata_tag_valid(unsigned int tag) { return tag < ATA_MAX_QUEUE || ata_tag_internal(tag); } #define __ata_qc_for_each(ap, qc, tag, max_tag, fn) \ for ((tag) = 0; (tag) < (max_tag) && \ ({ qc = fn((ap), (tag)); 1; }); (tag)++) \ /* * Internal use only, iterate commands ignoring error handling and * status of 'qc'. */ #define ata_qc_for_each_raw(ap, qc, tag) \ __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE, __ata_qc_from_tag) /* * Iterate all potential commands that can be queued */ #define ata_qc_for_each(ap, qc, tag) \ __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE, ata_qc_from_tag) /* * Like ata_qc_for_each, but with the internal tag included */ #define ata_qc_for_each_with_internal(ap, qc, tag) \ __ata_qc_for_each(ap, qc, tag, ATA_MAX_QUEUE + 1, ata_qc_from_tag) /* * device helpers */ static inline unsigned int ata_class_enabled(unsigned int class) { return class == ATA_DEV_ATA || class == ATA_DEV_ATAPI || class == ATA_DEV_PMP || class == ATA_DEV_SEMB || class == ATA_DEV_ZAC; } static inline unsigned int ata_class_disabled(unsigned int class) { return class == ATA_DEV_ATA_UNSUP || class == ATA_DEV_ATAPI_UNSUP || class == ATA_DEV_PMP_UNSUP || class == ATA_DEV_SEMB_UNSUP || class == ATA_DEV_ZAC_UNSUP; } static inline unsigned int ata_class_absent(unsigned int class) { return !ata_class_enabled(class) && !ata_class_disabled(class); } static inline unsigned int ata_dev_enabled(const struct ata_device *dev) { return ata_class_enabled(dev->class); } static inline unsigned int ata_dev_disabled(const struct ata_device *dev) { return ata_class_disabled(dev->class); } static inline unsigned int ata_dev_absent(const struct ata_device *dev) { return ata_class_absent(dev->class); } /* * link helpers */ static inline int ata_link_max_devices(const struct ata_link *link) { if (ata_is_host_link(link) && link->ap->flags & ATA_FLAG_SLAVE_POSS) return 2; return 1; } static inline int ata_link_active(struct ata_link *link) { return ata_tag_valid(link->active_tag) || link->sactive; } /* * Iterators * * ATA_LITER_* constants are used to select link iteration mode and * ATA_DITER_* device iteration mode. * * For a custom iteration directly using ata_{link|dev}_next(), if * @link or @dev, respectively, is NULL, the first element is * returned. @dev and @link can be any valid device or link and the * next element according to the iteration mode will be returned. * After the last element, NULL is returned. */ enum ata_link_iter_mode { ATA_LITER_EDGE, /* if present, PMP links only; otherwise, * host link. 
no slave link */ ATA_LITER_HOST_FIRST, /* host link followed by PMP or slave links */ ATA_LITER_PMP_FIRST, /* PMP links followed by host link, * slave link still comes after host link */ }; enum ata_dev_iter_mode { ATA_DITER_ENABLED, ATA_DITER_ENABLED_REVERSE, ATA_DITER_ALL, ATA_DITER_ALL_REVERSE, }; extern struct ata_link *ata_link_next(struct ata_link *link, struct ata_port *ap, enum ata_link_iter_mode mode); extern struct ata_device *ata_dev_next(struct ata_device *dev, struct ata_link *link, enum ata_dev_iter_mode mode); /* * Shortcut notation for iterations * * ata_for_each_link() iterates over each link of @ap according to * @mode. @link points to the current link in the loop. @link is * NULL after loop termination. ata_for_each_dev() works the same way * except that it iterates over each device of @link. * * Note that the mode prefixes ATA_{L|D}ITER_ shouldn't need to be * specified when using the following shorthand notations. Only the * mode itself (EDGE, HOST_FIRST, ENABLED, etc...) should be * specified. This not only increases brevity but also makes it * impossible to use ATA_LITER_* for device iteration or vice-versa. */ #define ata_for_each_link(link, ap, mode) \ for ((link) = ata_link_next(NULL, (ap), ATA_LITER_##mode); (link); \ (link) = ata_link_next((link), (ap), ATA_LITER_##mode)) #define ata_for_each_dev(dev, link, mode) \ for ((dev) = ata_dev_next(NULL, (link), ATA_DITER_##mode); (dev); \ (dev) = ata_dev_next((dev), (link), ATA_DITER_##mode)) /** * ata_ncq_enabled - Test whether NCQ is enabled * @dev: ATA device to test for * * LOCKING: * spin_lock_irqsave(host lock) * * RETURNS: * 1 if NCQ is enabled for @dev, 0 otherwise. */ static inline int ata_ncq_enabled(struct ata_device *dev) { if (!IS_ENABLED(CONFIG_SATA_HOST)) return 0; return (dev->flags & (ATA_DFLAG_PIO | ATA_DFLAG_NCQ_OFF | ATA_DFLAG_NCQ)) == ATA_DFLAG_NCQ; } static inline bool ata_fpdma_dsm_supported(struct ata_device *dev) { return (dev->flags & ATA_DFLAG_NCQ_SEND_RECV) && (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_DSM_OFFSET] & ATA_LOG_NCQ_SEND_RECV_DSM_TRIM); } static inline bool ata_fpdma_read_log_supported(struct ata_device *dev) { return (dev->flags & ATA_DFLAG_NCQ_SEND_RECV) && (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_RD_LOG_OFFSET] & ATA_LOG_NCQ_SEND_RECV_RD_LOG_SUPPORTED); } static inline bool ata_fpdma_zac_mgmt_in_supported(struct ata_device *dev) { return (dev->flags & ATA_DFLAG_NCQ_SEND_RECV) && (dev->ncq_send_recv_cmds[ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_OFFSET] & ATA_LOG_NCQ_SEND_RECV_ZAC_MGMT_IN_SUPPORTED); } static inline bool ata_fpdma_zac_mgmt_out_supported(struct ata_device *dev) { return (dev->ncq_non_data_cmds[ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OFFSET] & ATA_LOG_NCQ_NON_DATA_ZAC_MGMT_OUT); } static inline void ata_qc_set_polling(struct ata_queued_cmd *qc) { qc->tf.ctl |= ATA_NIEN; } static inline struct ata_queued_cmd *__ata_qc_from_tag(struct ata_port *ap, unsigned int tag) { if (ata_tag_valid(tag)) return &ap->qcmd[tag]; return NULL; } static inline struct ata_queued_cmd *ata_qc_from_tag(struct ata_port *ap, unsigned int tag) { struct ata_queued_cmd *qc = __ata_qc_from_tag(ap, tag); if (unlikely(!qc) || !ap->ops->error_handler) return qc; if ((qc->flags & (ATA_QCFLAG_ACTIVE | ATA_QCFLAG_FAILED)) == ATA_QCFLAG_ACTIVE) return qc; return NULL; } static inline unsigned int ata_qc_raw_nbytes(struct ata_queued_cmd *qc) { return qc->nbytes - min(qc->extrabytes, qc->nbytes); } static inline void ata_tf_init(struct ata_device *dev, struct ata_taskfile *tf) { memset(tf, 0, sizeof(*tf)); 
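/* With the taskfile zeroed, the remainder of ata_tf_init() sets the initial device control value (the SFF port's cached ->ctl when CONFIG_ATA_SFF is enabled, ATA_DEVCTL_OBS otherwise) and selects device 0 or 1 via the device register. */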
#ifdef CONFIG_ATA_SFF tf->ctl = dev->link->ap->ctl; #else tf->ctl = ATA_DEVCTL_OBS; #endif if (dev->devno == 0) tf->device = ATA_DEVICE_OBS; else tf->device = ATA_DEVICE_OBS | ATA_DEV1; } static inline void ata_qc_reinit(struct ata_queued_cmd *qc) { qc->dma_dir = DMA_NONE; qc->sg = NULL; qc->flags = 0; qc->cursg = NULL; qc->cursg_ofs = 0; qc->nbytes = qc->extrabytes = qc->curbytes = 0; qc->n_elem = 0; qc->err_mask = 0; qc->sect_size = ATA_SECT_SIZE; ata_tf_init(qc->dev, &qc->tf); /* init result_tf such that it indicates normal completion */ qc->result_tf.command = ATA_DRDY; qc->result_tf.feature = 0; } static inline int ata_try_flush_cache(const struct ata_device *dev) { return ata_id_wcache_enabled(dev->id) || ata_id_has_flush(dev->id) || ata_id_has_flush_ext(dev->id); } static inline unsigned int ac_err_mask(u8 status) { if (status & (ATA_BUSY | ATA_DRQ)) return AC_ERR_HSM; if (status & (ATA_ERR | ATA_DF)) return AC_ERR_DEV; return 0; } static inline unsigned int __ac_err_mask(u8 status) { unsigned int mask = ac_err_mask(status); if (mask == 0) return AC_ERR_OTHER; return mask; } static inline struct ata_port *ata_shost_to_port(struct Scsi_Host *host) { return *(struct ata_port **)&host->hostdata[0]; } static inline int ata_check_ready(u8 status) { if (!(status & ATA_BUSY)) return 1; /* 0xff indicates either no device or device not ready */ if (status == 0xff) return -ENODEV; return 0; } static inline unsigned long ata_deadline(unsigned long from_jiffies, unsigned long timeout_msecs) { return from_jiffies + msecs_to_jiffies(timeout_msecs); } /* Don't open code these in drivers as there are traps. Firstly the range may change in future hardware and specs, secondly 0xFF means 'no DMA' but is > UDMA_0. Dyma ddreigiau */ static inline int ata_using_mwdma(struct ata_device *adev) { if (adev->dma_mode >= XFER_MW_DMA_0 && adev->dma_mode <= XFER_MW_DMA_4) return 1; return 0; } static inline int ata_using_udma(struct ata_device *adev) { if (adev->dma_mode >= XFER_UDMA_0 && adev->dma_mode <= XFER_UDMA_7) return 1; return 0; } static inline int ata_dma_enabled(struct ata_device *adev) { return (adev->dma_mode == 0xFF ? 
0 : 1); } /************************************************************************** * PATA timings - drivers/ata/libata-pata-timings.c */ extern const struct ata_timing *ata_timing_find_mode(u8 xfer_mode); extern int ata_timing_compute(struct ata_device *, unsigned short, struct ata_timing *, int, int); extern void ata_timing_merge(const struct ata_timing *, const struct ata_timing *, struct ata_timing *, unsigned int); /************************************************************************** * PMP - drivers/ata/libata-pmp.c */ #ifdef CONFIG_SATA_PMP extern const struct ata_port_operations sata_pmp_port_ops; extern int sata_pmp_qc_defer_cmd_switch(struct ata_queued_cmd *qc); extern void sata_pmp_error_handler(struct ata_port *ap); #else /* CONFIG_SATA_PMP */ #define sata_pmp_port_ops sata_port_ops #define sata_pmp_qc_defer_cmd_switch ata_std_qc_defer #define sata_pmp_error_handler ata_std_error_handler #endif /* CONFIG_SATA_PMP */ /************************************************************************** * SFF - drivers/ata/libata-sff.c */ #ifdef CONFIG_ATA_SFF extern const struct ata_port_operations ata_sff_port_ops; extern const struct ata_port_operations ata_bmdma32_port_ops; /* PIO only, sg_tablesize and dma_boundary limits can be removed */ #define ATA_PIO_SHT(drv_name) \ ATA_BASE_SHT(drv_name), \ .sg_tablesize = LIBATA_MAX_PRD, \ .dma_boundary = ATA_DMA_BOUNDARY extern void ata_sff_dev_select(struct ata_port *ap, unsigned int device); extern u8 ata_sff_check_status(struct ata_port *ap); extern void ata_sff_pause(struct ata_port *ap); extern void ata_sff_dma_pause(struct ata_port *ap); extern int ata_sff_busy_sleep(struct ata_port *ap, unsigned long timeout_pat, unsigned long timeout); extern int ata_sff_wait_ready(struct ata_link *link, unsigned long deadline); extern void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf); extern void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf); extern void ata_sff_exec_command(struct ata_port *ap, const struct ata_taskfile *tf); extern unsigned int ata_sff_data_xfer(struct ata_queued_cmd *qc, unsigned char *buf, unsigned int buflen, int rw); extern unsigned int ata_sff_data_xfer32(struct ata_queued_cmd *qc, unsigned char *buf, unsigned int buflen, int rw); extern void ata_sff_irq_on(struct ata_port *ap); extern void ata_sff_irq_clear(struct ata_port *ap); extern int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc, u8 status, int in_wq); extern void ata_sff_queue_work(struct work_struct *work); extern void ata_sff_queue_delayed_work(struct delayed_work *dwork, unsigned long delay); extern void ata_sff_queue_pio_task(struct ata_link *link, unsigned long delay); extern unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc); extern bool ata_sff_qc_fill_rtf(struct ata_queued_cmd *qc); extern unsigned int ata_sff_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc); extern irqreturn_t ata_sff_interrupt(int irq, void *dev_instance); extern void ata_sff_lost_interrupt(struct ata_port *ap); extern void ata_sff_freeze(struct ata_port *ap); extern void ata_sff_thaw(struct ata_port *ap); extern int ata_sff_prereset(struct ata_link *link, unsigned long deadline); extern unsigned int ata_sff_dev_classify(struct ata_device *dev, int present, u8 *r_err); extern int ata_sff_wait_after_reset(struct ata_link *link, unsigned int devmask, unsigned long deadline); extern int ata_sff_softreset(struct ata_link *link, unsigned int *classes, unsigned long deadline); extern int sata_sff_hardreset(struct ata_link 
*link, unsigned int *class, unsigned long deadline); extern void ata_sff_postreset(struct ata_link *link, unsigned int *classes); extern void ata_sff_drain_fifo(struct ata_queued_cmd *qc); extern void ata_sff_error_handler(struct ata_port *ap); extern void ata_sff_std_ports(struct ata_ioports *ioaddr); #ifdef CONFIG_PCI extern int ata_pci_sff_init_host(struct ata_host *host); extern int ata_pci_sff_prepare_host(struct pci_dev *pdev, const struct ata_port_info * const * ppi, struct ata_host **r_host); extern int ata_pci_sff_activate_host(struct ata_host *host, irq_handler_t irq_handler, struct scsi_host_template *sht); extern int ata_pci_sff_init_one(struct pci_dev *pdev, const struct ata_port_info * const * ppi, struct scsi_host_template *sht, void *host_priv, int hflags); #endif /* CONFIG_PCI */ #ifdef CONFIG_ATA_BMDMA extern const struct ata_port_operations ata_bmdma_port_ops; #define ATA_BMDMA_SHT(drv_name) \ ATA_BASE_SHT(drv_name), \ .sg_tablesize = LIBATA_MAX_PRD, \ .dma_boundary = ATA_DMA_BOUNDARY extern enum ata_completion_errors ata_bmdma_qc_prep(struct ata_queued_cmd *qc); extern unsigned int ata_bmdma_qc_issue(struct ata_queued_cmd *qc); extern enum ata_completion_errors ata_bmdma_dumb_qc_prep(struct ata_queued_cmd *qc); extern unsigned int ata_bmdma_port_intr(struct ata_port *ap, struct ata_queued_cmd *qc); extern irqreturn_t ata_bmdma_interrupt(int irq, void *dev_instance); extern void ata_bmdma_error_handler(struct ata_port *ap); extern void ata_bmdma_post_internal_cmd(struct ata_queued_cmd *qc); extern void ata_bmdma_irq_clear(struct ata_port *ap); extern void ata_bmdma_setup(struct ata_queued_cmd *qc); extern void ata_bmdma_start(struct ata_queued_cmd *qc); extern void ata_bmdma_stop(struct ata_queued_cmd *qc); extern u8 ata_bmdma_status(struct ata_port *ap); extern int ata_bmdma_port_start(struct ata_port *ap); extern int ata_bmdma_port_start32(struct ata_port *ap); #ifdef CONFIG_PCI extern int ata_pci_bmdma_clear_simplex(struct pci_dev *pdev); extern void ata_pci_bmdma_init(struct ata_host *host); extern int ata_pci_bmdma_prepare_host(struct pci_dev *pdev, const struct ata_port_info * const * ppi, struct ata_host **r_host); extern int ata_pci_bmdma_init_one(struct pci_dev *pdev, const struct ata_port_info * const * ppi, struct scsi_host_template *sht, void *host_priv, int hflags); #endif /* CONFIG_PCI */ #endif /* CONFIG_ATA_BMDMA */ /** * ata_sff_busy_wait - Wait for a port status register * @ap: Port to wait for. * @bits: bits that must be clear * @max: number of 10uS waits to perform * * Waits up to max*10 microseconds for the selected bits in the port's * status register to be cleared. * Returns final value of status register. * * LOCKING: * Inherited from caller. */ static inline u8 ata_sff_busy_wait(struct ata_port *ap, unsigned int bits, unsigned int max) { u8 status; do { udelay(10); status = ap->ops->sff_check_status(ap); max--; } while (status != 0xff && (status & bits) && (max > 0)); return status; } /** * ata_wait_idle - Wait for a port to be idle. * @ap: Port to wait for. * * Waits up to 10ms for port's BUSY and DRQ signals to clear. * Returns final value of status register. * * LOCKING: * Inherited from caller. */ static inline u8 ata_wait_idle(struct ata_port *ap) { u8 status = ata_sff_busy_wait(ap, ATA_BUSY | ATA_DRQ, 1000); #ifdef ATA_DEBUG if (status != 0xff && (status & (ATA_BUSY | ATA_DRQ))) ata_port_printk(ap, KERN_DEBUG, "abnormal Status 0x%X\n", status); #endif return status; } #endif /* CONFIG_ATA_SFF */ #endif /* __LINUX_LIBATA_H__ */
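/*
 * Illustrative usage sketch (not part of the original header): walking a
 * port with the ata_for_each_link()/ata_for_each_dev() shorthand described
 * above, passing the iteration mode without the ATA_{L|D}ITER_ prefix.
 * The function name example_log_ncq_devices() is hypothetical; only the
 * iterator, ata_ncq_enabled() and ata_dev_info() usage mirrors the API above.
 */
static void example_log_ncq_devices(struct ata_port *ap)
{
	struct ata_link *link;
	struct ata_device *dev;

	/* EDGE: PMP links if a PMP is attached, otherwise the host link */
	ata_for_each_link(link, ap, EDGE) {
		/* ENABLED: skip devices whose class is not enabled */
		ata_for_each_dev(dev, link, ENABLED) {
			if (ata_ncq_enabled(dev))
				ata_dev_info(dev, "NCQ is enabled\n");
		}
	}
}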
/* SPDX-License-Identifier: GPL-2.0 */ /* * Definitions for diskquota-operations. When diskquota is configured these * macros expand to the right source-code. * * Author: Marco van Wieringen <mvw@planets.elm.net> */ #ifndef _LINUX_QUOTAOPS_ #define _LINUX_QUOTAOPS_ #include <linux/fs.h> #define DQUOT_SPACE_WARN 0x1 #define DQUOT_SPACE_RESERVE 0x2 #define DQUOT_SPACE_NOFAIL 0x4 static inline struct quota_info *sb_dqopt(struct super_block *sb) { return &sb->s_dquot; } /* i_mutex must be held */ static inline bool is_quota_modification(struct inode *inode, struct iattr *ia) { return (ia->ia_valid & ATTR_SIZE) || (ia->ia_valid & ATTR_UID && !uid_eq(ia->ia_uid, inode->i_uid)) || (ia->ia_valid & ATTR_GID && !gid_eq(ia->ia_gid, inode->i_gid)); } #if defined(CONFIG_QUOTA) #define quota_error(sb, fmt, args...) \ __quota_error((sb), __func__, fmt , ## args) extern __printf(3, 4) void __quota_error(struct super_block *sb, const char *func, const char *fmt, ...); /* * declaration of quota_function calls in kernel. 
*/ int dquot_initialize(struct inode *inode); bool dquot_initialize_needed(struct inode *inode); void dquot_drop(struct inode *inode); struct dquot *dqget(struct super_block *sb, struct kqid qid); static inline struct dquot *dqgrab(struct dquot *dquot) { /* Make sure someone else has active reference to dquot */ WARN_ON_ONCE(!atomic_read(&dquot->dq_count)); WARN_ON_ONCE(!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)); atomic_inc(&dquot->dq_count); return dquot; } static inline bool dquot_is_busy(struct dquot *dquot) { if (test_bit(DQ_MOD_B, &dquot->dq_flags)) return true; if (atomic_read(&dquot->dq_count) > 1) return true; return false; } void dqput(struct dquot *dquot); int dquot_scan_active(struct super_block *sb, int (*fn)(struct dquot *dquot, unsigned long priv), unsigned long priv); struct dquot *dquot_alloc(struct super_block *sb, int type); void dquot_destroy(struct dquot *dquot); int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags); void __dquot_free_space(struct inode *inode, qsize_t number, int flags); int dquot_alloc_inode(struct inode *inode); int dquot_claim_space_nodirty(struct inode *inode, qsize_t number); void dquot_free_inode(struct inode *inode); void dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number); int dquot_disable(struct super_block *sb, int type, unsigned int flags); /* Suspend quotas on remount RO */ static inline int dquot_suspend(struct super_block *sb, int type) { return dquot_disable(sb, type, DQUOT_SUSPENDED); } int dquot_resume(struct super_block *sb, int type); int dquot_commit(struct dquot *dquot); int dquot_acquire(struct dquot *dquot); int dquot_release(struct dquot *dquot); int dquot_commit_info(struct super_block *sb, int type); int dquot_get_next_id(struct super_block *sb, struct kqid *qid); int dquot_mark_dquot_dirty(struct dquot *dquot); int dquot_file_open(struct inode *inode, struct file *file); int dquot_load_quota_sb(struct super_block *sb, int type, int format_id, unsigned int flags); int dquot_load_quota_inode(struct inode *inode, int type, int format_id, unsigned int flags); int dquot_quota_on(struct super_block *sb, int type, int format_id, const struct path *path); int dquot_quota_on_mount(struct super_block *sb, char *qf_name, int format_id, int type); int dquot_quota_off(struct super_block *sb, int type); int dquot_writeback_dquots(struct super_block *sb, int type); int dquot_quota_sync(struct super_block *sb, int type); int dquot_get_state(struct super_block *sb, struct qc_state *state); int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii); int dquot_get_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int dquot_get_next_dqblk(struct super_block *sb, struct kqid *id, struct qc_dqblk *di); int dquot_set_dqblk(struct super_block *sb, struct kqid id, struct qc_dqblk *di); int __dquot_transfer(struct inode *inode, struct dquot **transfer_to); int dquot_transfer(struct inode *inode, struct iattr *iattr); static inline struct mem_dqinfo *sb_dqinfo(struct super_block *sb, int type) { return sb_dqopt(sb)->info + type; } /* * Functions for checking status of quota */ static inline bool sb_has_quota_usage_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_USAGE_ENABLED, type); } static inline bool sb_has_quota_limits_enabled(struct super_block *sb, int type) { return sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_LIMITS_ENABLED, type); } static inline bool sb_has_quota_suspended(struct super_block *sb, int type) { return 
sb_dqopt(sb)->flags & dquot_state_flag(DQUOT_SUSPENDED, type); } static inline unsigned sb_any_quota_suspended(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_SUSPENDED); } /* Does kernel know about any quota information for given sb + type? */ static inline bool sb_has_quota_loaded(struct super_block *sb, int type) { /* Currently if anything is on, then quota usage is on as well */ return sb_has_quota_usage_enabled(sb, type); } static inline unsigned sb_any_quota_loaded(struct super_block *sb) { return dquot_state_types(sb_dqopt(sb)->flags, DQUOT_USAGE_ENABLED); } static inline bool sb_has_quota_active(struct super_block *sb, int type) { return sb_has_quota_loaded(sb, type) && !sb_has_quota_suspended(sb, type); } /* * Operations supported for diskquotas. */ extern const struct dquot_operations dquot_operations; extern const struct quotactl_ops dquot_quotactl_sysfile_ops; #else static inline int sb_has_quota_usage_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_limits_enabled(struct super_block *sb, int type) { return 0; } static inline int sb_has_quota_suspended(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_suspended(struct super_block *sb) { return 0; } /* Does kernel know about any quota information for given sb + type? */ static inline int sb_has_quota_loaded(struct super_block *sb, int type) { return 0; } static inline int sb_any_quota_loaded(struct super_block *sb) { return 0; } static inline int sb_has_quota_active(struct super_block *sb, int type) { return 0; } static inline int dquot_initialize(struct inode *inode) { return 0; } static inline bool dquot_initialize_needed(struct inode *inode) { return false; } static inline void dquot_drop(struct inode *inode) { } static inline int dquot_alloc_inode(struct inode *inode) { return 0; } static inline void dquot_free_inode(struct inode *inode) { } static inline int dquot_transfer(struct inode *inode, struct iattr *iattr) { return 0; } static inline int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_add_bytes(inode, number); return 0; } static inline void __dquot_free_space(struct inode *inode, qsize_t number, int flags) { if (!(flags & DQUOT_SPACE_RESERVE)) inode_sub_bytes(inode, number); } static inline int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { inode_add_bytes(inode, number); return 0; } static inline int dquot_reclaim_space_nodirty(struct inode *inode, qsize_t number) { inode_sub_bytes(inode, number); return 0; } static inline int dquot_disable(struct super_block *sb, int type, unsigned int flags) { return 0; } static inline int dquot_suspend(struct super_block *sb, int type) { return 0; } static inline int dquot_resume(struct super_block *sb, int type) { return 0; } #define dquot_file_open generic_file_open static inline int dquot_writeback_dquots(struct super_block *sb, int type) { return 0; } #endif /* CONFIG_QUOTA */ static inline int dquot_alloc_space_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN); } static inline void dquot_alloc_space_nofail(struct inode *inode, qsize_t nr) { __dquot_alloc_space(inode, nr, DQUOT_SPACE_WARN|DQUOT_SPACE_NOFAIL); mark_inode_dirty_sync(inode); } static inline int dquot_alloc_space(struct inode *inode, qsize_t nr) { int ret; ret = dquot_alloc_space_nodirty(inode, nr); if (!ret) { /* * Mark inode fully dirty. 
Since we are allocating blocks, inode * would become fully dirty soon anyway and it reportedly * reduces lock contention. */ mark_inode_dirty(inode); } return ret; } static inline int dquot_alloc_block_nodirty(struct inode *inode, qsize_t nr) { return dquot_alloc_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_alloc_block_nofail(struct inode *inode, qsize_t nr) { dquot_alloc_space_nofail(inode, nr << inode->i_blkbits); } static inline int dquot_alloc_block(struct inode *inode, qsize_t nr) { return dquot_alloc_space(inode, nr << inode->i_blkbits); } static inline int dquot_prealloc_block_nodirty(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, 0); } static inline int dquot_prealloc_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_prealloc_block_nodirty(inode, nr); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline int dquot_reserve_block(struct inode *inode, qsize_t nr) { return __dquot_alloc_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_WARN|DQUOT_SPACE_RESERVE); } static inline int dquot_claim_block(struct inode *inode, qsize_t nr) { int ret; ret = dquot_claim_space_nodirty(inode, nr << inode->i_blkbits); if (!ret) mark_inode_dirty_sync(inode); return ret; } static inline void dquot_reclaim_block(struct inode *inode, qsize_t nr) { dquot_reclaim_space_nodirty(inode, nr << inode->i_blkbits); mark_inode_dirty_sync(inode); } static inline void dquot_free_space_nodirty(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr, 0); } static inline void dquot_free_space(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr); mark_inode_dirty_sync(inode); } static inline void dquot_free_block_nodirty(struct inode *inode, qsize_t nr) { dquot_free_space_nodirty(inode, nr << inode->i_blkbits); } static inline void dquot_free_block(struct inode *inode, qsize_t nr) { dquot_free_space(inode, nr << inode->i_blkbits); } static inline void dquot_release_reservation_block(struct inode *inode, qsize_t nr) { __dquot_free_space(inode, nr << inode->i_blkbits, DQUOT_SPACE_RESERVE); } unsigned int qtype_enforce_flag(int type); #endif /* _LINUX_QUOTAOPS_ */
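/*
 * Illustrative sketch (not part of the original header): how a filesystem
 * write path might pair dquot_alloc_block() with dquot_free_block() so that
 * a failed allocation does not leave the owner's quota charged.
 * example_alloc_block() is a hypothetical stand-in for filesystem-specific
 * allocator code; the dquot_* calls are the helpers declared above.
 */
static int example_alloc_block(struct inode *inode, sector_t *blk); /* hypothetical */

static int example_fs_alloc_one_block(struct inode *inode, sector_t *blk)
{
	int err;

	/* charge one block (scaled by i_blkbits) to the inode's owner */
	err = dquot_alloc_block(inode, 1);
	if (err)
		return err;	/* typically -EDQUOT when over the limit */

	err = example_alloc_block(inode, blk);
	if (err)
		dquot_free_block(inode, 1);	/* undo the quota charge */

	return err;
}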
/* SPDX-License-Identifier: GPL-2.0-or-later */ /* * INET An implementation of the TCP/IP protocol suite for the LINUX * operating system. INET is implemented using the BSD Socket * interface as the means of communication with the user level. * * Definitions for inet_sock * * Authors: Many, reorganised here by * Arnaldo Carvalho de Melo <acme@mandriva.com> */ #ifndef _INET_SOCK_H #define _INET_SOCK_H #include <linux/bitops.h> #include <linux/string.h> #include <linux/types.h> #include <linux/jhash.h> #include <linux/netdevice.h> #include <net/flow.h> #include <net/sock.h> #include <net/request_sock.h> #include <net/netns/hash.h> #include <net/tcp_states.h> #include <net/l3mdev.h> /** struct ip_options - IP Options * * @faddr - Saved first hop address * @nexthop - Saved nexthop address in LSRR and SSRR * @is_strictroute - Strict source route * @srr_is_hit - Packet destination addr was ours * @is_changed - IP checksum no longer valid * @rr_needaddr - Need to record addr of outgoing dev * @ts_needtime - Need to record timestamp * @ts_needaddr - Need to record addr of outgoing dev */ struct ip_options { __be32 faddr; __be32 nexthop; unsigned char optlen; unsigned char srr; unsigned char rr; unsigned char ts; unsigned char is_strictroute:1, srr_is_hit:1, is_changed:1, rr_needaddr:1, ts_needtime:1, ts_needaddr:1; unsigned char router_alert; unsigned char cipso; unsigned char __pad2; unsigned char __data[]; }; struct ip_options_rcu { struct rcu_head rcu; struct ip_options opt; }; struct ip_options_data { struct ip_options_rcu opt; char data[40]; }; struct inet_request_sock { struct request_sock req; #define ir_loc_addr req.__req_common.skc_rcv_saddr #define ir_rmt_addr req.__req_common.skc_daddr #define ir_num req.__req_common.skc_num #define ir_rmt_port req.__req_common.skc_dport #define ir_v6_rmt_addr req.__req_common.skc_v6_daddr #define ir_v6_loc_addr req.__req_common.skc_v6_rcv_saddr #define ir_iif req.__req_common.skc_bound_dev_if #define ir_cookie req.__req_common.skc_cookie #define ireq_net req.__req_common.skc_net #define ireq_state req.__req_common.skc_state #define ireq_family req.__req_common.skc_family u16 
snd_wscale : 4, rcv_wscale : 4, tstamp_ok : 1, sack_ok : 1, wscale_ok : 1, ecn_ok : 1, acked : 1, no_srccheck: 1, smc_ok : 1; u32 ir_mark; union { struct ip_options_rcu __rcu *ireq_opt; #if IS_ENABLED(CONFIG_IPV6) struct { struct ipv6_txoptions *ipv6_opt; struct sk_buff *pktopts; }; #endif }; }; static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) { return (struct inet_request_sock *)sk; } static inline u32 inet_request_mark(const struct sock *sk, struct sk_buff *skb) { if (!sk->sk_mark && sock_net(sk)->ipv4.sysctl_tcp_fwmark_accept) return skb->mark; return sk->sk_mark; } static inline int inet_request_bound_dev_if(const struct sock *sk, struct sk_buff *skb) { #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!sk->sk_bound_dev_if && net->ipv4.sysctl_tcp_l3mdev_accept) return l3mdev_master_ifindex_by_index(net, skb->skb_iif); #endif return sk->sk_bound_dev_if; } static inline int inet_sk_bound_l3mdev(const struct sock *sk) { #ifdef CONFIG_NET_L3_MASTER_DEV struct net *net = sock_net(sk); if (!net->ipv4.sysctl_tcp_l3mdev_accept) return l3mdev_master_ifindex_by_index(net, sk->sk_bound_dev_if); #endif return 0; } static inline bool inet_bound_dev_eq(bool l3mdev_accept, int bound_dev_if, int dif, int sdif) { if (!bound_dev_if) return !sdif || l3mdev_accept; return bound_dev_if == dif || bound_dev_if == sdif; } struct inet_cork { unsigned int flags; __be32 addr; struct ip_options *opt; unsigned int fragsize; int length; /* Total length of all frames */ struct dst_entry *dst; u8 tx_flags; __u8 ttl; __s16 tos; char priority; __u16 gso_size; u64 transmit_time; u32 mark; }; struct inet_cork_full { struct inet_cork base; struct flowi fl; }; struct ip_mc_socklist; struct ipv6_pinfo; struct rtable; /** struct inet_sock - representation of INET sockets * * @sk - ancestor class * @pinet6 - pointer to IPv6 control block * @inet_daddr - Foreign IPv4 addr * @inet_rcv_saddr - Bound local IPv4 addr * @inet_dport - Destination port * @inet_num - Local port * @inet_saddr - Sending source * @uc_ttl - Unicast TTL * @inet_sport - Source port * @inet_id - ID counter for DF pkts * @tos - TOS * @mc_ttl - Multicasting TTL * @is_icsk - is this an inet_connection_sock? * @uc_index - Unicast outgoing device index * @mc_index - Multicast device index * @mc_list - Group array * @cork - info to build ip hdr on each ip frag while socket is corked */ struct inet_sock { /* sk and pinet6 has to be the first two members of inet_sock */ struct sock sk; #if IS_ENABLED(CONFIG_IPV6) struct ipv6_pinfo *pinet6; #endif /* Socket demultiplex comparisons on incoming packets. 
*/ #define inet_daddr sk.__sk_common.skc_daddr #define inet_rcv_saddr sk.__sk_common.skc_rcv_saddr #define inet_dport sk.__sk_common.skc_dport #define inet_num sk.__sk_common.skc_num __be32 inet_saddr; __s16 uc_ttl; __u16 cmsg_flags; __be16 inet_sport; __u16 inet_id; struct ip_options_rcu __rcu *inet_opt; int rx_dst_ifindex; __u8 tos; __u8 min_ttl; __u8 mc_ttl; __u8 pmtudisc; __u8 recverr:1, is_icsk:1, freebind:1, hdrincl:1, mc_loop:1, transparent:1, mc_all:1, nodefrag:1; __u8 bind_address_no_port:1, recverr_rfc4884:1, defer_connect:1; /* Indicates that fastopen_connect is set * and cookie exists so we defer connect * until first data frame is written */ __u8 rcv_tos; __u8 convert_csum; int uc_index; int mc_index; __be32 mc_addr; struct ip_mc_socklist __rcu *mc_list; struct inet_cork_full cork; }; #define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ #define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ /* cmsg flags for inet */ #define IP_CMSG_PKTINFO BIT(0) #define IP_CMSG_TTL BIT(1) #define IP_CMSG_TOS BIT(2) #define IP_CMSG_RECVOPTS BIT(3) #define IP_CMSG_RETOPTS BIT(4) #define IP_CMSG_PASSSEC BIT(5) #define IP_CMSG_ORIGDSTADDR BIT(6) #define IP_CMSG_CHECKSUM BIT(7) #define IP_CMSG_RECVFRAGSIZE BIT(8) /** * sk_to_full_sk - Access to a full socket * @sk: pointer to a socket * * SYNACK messages might be attached to request sockets. * Some places want to reach the listener in this case. */ static inline struct sock *sk_to_full_sk(struct sock *sk) { #ifdef CONFIG_INET if (sk && sk->sk_state == TCP_NEW_SYN_RECV) sk = inet_reqsk(sk)->rsk_listener; #endif return sk; } /* sk_to_full_sk() variant with a const argument */ static inline const struct sock *sk_const_to_full_sk(const struct sock *sk) { #ifdef CONFIG_INET if (sk && sk->sk_state == TCP_NEW_SYN_RECV) sk = ((const struct request_sock *)sk)->rsk_listener; #endif return sk; } static inline struct sock *skb_to_full_sk(const struct sk_buff *skb) { return sk_to_full_sk(skb->sk); } static inline struct inet_sock *inet_sk(const struct sock *sk) { return (struct inet_sock *)sk; } static inline void __inet_sk_copy_descendant(struct sock *sk_to, const struct sock *sk_from, const int ancestor_size) { memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, sk_from->sk_prot->obj_size - ancestor_size); } int inet_sk_rebuild_header(struct sock *sk); /** * inet_sk_state_load - read sk->sk_state for lockless contexts * @sk: socket pointer * * Paired with inet_sk_state_store(). Used in places we don't hold socket lock: * tcp_diag_get_info(), tcp_get_info(), tcp_poll(), get_tcp4_sock() ... */ static inline int inet_sk_state_load(const struct sock *sk) { /* state change might impact lockless readers. */ return smp_load_acquire(&sk->sk_state); } /** * inet_sk_state_store - update sk->sk_state * @sk: socket pointer * @newstate: new state * * Paired with inet_sk_state_load(). Should be used in contexts where * state change might impact lockless readers. 
*/ void inet_sk_state_store(struct sock *sk, int newstate); void inet_sk_set_state(struct sock *sk, int state); static inline unsigned int __inet_ehashfn(const __be32 laddr, const __u16 lport, const __be32 faddr, const __be16 fport, u32 initval) { return jhash_3words((__force __u32) laddr, (__force __u32) faddr, ((__u32) lport) << 16 | (__force __u32)fport, initval); } struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener, bool attach_listener); static inline __u8 inet_sk_flowi_flags(const struct sock *sk) { __u8 flags = 0; if (inet_sk(sk)->transparent || inet_sk(sk)->hdrincl) flags |= FLOWI_FLAG_ANYSRC; return flags; } static inline void inet_inc_convert_csum(struct sock *sk) { inet_sk(sk)->convert_csum++; } static inline void inet_dec_convert_csum(struct sock *sk) { if (inet_sk(sk)->convert_csum > 0) inet_sk(sk)->convert_csum--; } static inline bool inet_get_convert_csum(struct sock *sk) { return !!inet_sk(sk)->convert_csum; } static inline bool inet_can_nonlocal_bind(struct net *net, struct inet_sock *inet) { return net->ipv4.sysctl_ip_nonlocal_bind || inet->freebind || inet->transparent; } #endif /* _INET_SOCK_H */
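/*
 * Illustrative sketch (not part of the original header): reading INET
 * addressing fields through inet_sk() and sampling the socket state with
 * inet_sk_state_load(), which the comment above documents as safe for
 * lockless readers.  example_dump_inet_sock() is a hypothetical helper.
 */
static void example_dump_inet_sock(const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);

	if (inet_sk_state_load(sk) != TCP_ESTABLISHED)
		return;

	pr_info("established %pI4:%u -> %pI4:%u\n",
		&inet->inet_saddr, ntohs(inet->inet_sport),
		&inet->inet_daddr, ntohs(inet->inet_dport));
}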
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_MROUTE_H #define __LINUX_MROUTE_H #include <linux/in.h> #include <linux/pim.h> #include <net/fib_rules.h> #include <net/fib_notifier.h> #include <uapi/linux/mroute.h> #include <linux/mroute_base.h> #include <linux/sockptr.h> #ifdef CONFIG_IP_MROUTE static inline int ip_mroute_opt(int opt) { return opt >= MRT_BASE && opt <= MRT_MAX; } int ip_mroute_setsockopt(struct sock *, int, sockptr_t, unsigned int); int ip_mroute_getsockopt(struct sock *, int, char __user *, int __user *); int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg); int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg); int ip_mr_init(void); bool ipmr_rule_default(const struct fib_rule *rule); #else static inline int ip_mroute_setsockopt(struct sock *sock, int optname, sockptr_t optval, unsigned int optlen) { return -ENOPROTOOPT; } static inline int ip_mroute_getsockopt(struct sock *sock, int optname, char __user *optval, int __user *optlen) { return -ENOPROTOOPT; } static inline int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) { return -ENOIOCTLCMD; } static inline int ip_mr_init(void) { return 0; } static inline int ip_mroute_opt(int opt) { return 0; } static inline bool ipmr_rule_default(const struct fib_rule *rule) { return true; } #endif #define VIFF_STATIC 0x8000 struct mfc_cache_cmp_arg { __be32 mfc_mcastgrp; __be32 mfc_origin; }; /** * struct mfc_cache - multicast routing entries * @_c: Common multicast routing information; has to be first [for casting] * @mfc_mcastgrp: destination multicast group address * @mfc_origin: source address * @cmparg: used for rhashtable comparisons */ struct mfc_cache { struct mr_mfc _c; union { struct { __be32 mfc_mcastgrp; __be32 mfc_origin; }; struct mfc_cache_cmp_arg cmparg; }; }; struct rtmsg; int ipmr_get_route(struct net *net, struct sk_buff *skb, __be32 saddr, __be32 daddr, struct rtmsg *rtm, u32 portid); #endif
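/*
 * Illustrative sketch (not part of the original header): how an IP
 * setsockopt dispatcher can use ip_mroute_opt() to hand the MRT_* option
 * range to the multicast routing code.  When CONFIG_IP_MROUTE is off, the
 * stubs above make the same path degrade to -ENOPROTOOPT.  The function
 * name example_ip_setsockopt() is hypothetical; the flow is a simplified
 * version of what the real IPv4 sockopt code does.
 */
static int example_ip_setsockopt(struct sock *sk, int optname,
				 sockptr_t optval, unsigned int optlen)
{
	if (ip_mroute_opt(optname))
		return ip_mroute_setsockopt(sk, optname, optval, optlen);

	/* non-multicast-routing options would be handled here */
	return -ENOPROTOOPT;
}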
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_UACCESS_H #define _ASM_X86_UACCESS_H /* * User space memory access functions */ #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> #include <asm/smap.h> #include <asm/extable.h> /* * Test whether a block of memory is a valid user space address. * Returns 0 if the range is valid, nonzero otherwise. */ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, unsigned long limit) { /* * If we have used "sizeof()" for the size, * we know it won't overflow the limit (but * it might overflow the 'addr', so it's * important to subtract the size from the * limit, not add it to the address). */ if (__builtin_constant_p(size)) return unlikely(addr > limit - size); /* Arbitrary sizes? 
Be careful about overflow */ addr += size; if (unlikely(addr < size)) return true; return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ ({ \ __chk_user_ptr(addr); \ __chk_range_not_ok((unsigned long __force)(addr), size, limit); \ }) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline bool pagefault_disabled(void); # define WARN_ON_IN_IRQ() \ WARN_ON_ONCE(!in_task() && !pagefault_disabled()) #else # define WARN_ON_IN_IRQ() #endif /** * access_ok - Checks if a user space pointer is valid * @addr: User space pointer to start of block to check * @size: Size of block to check * * Context: User context only. This function may sleep if pagefaults are * enabled. * * Checks if a pointer to a block of memory in user space is valid. * * Note that, depending on architecture, this function probably just * checks that the pointer is in the user space range - after calling * this function, memory access functions may still return -EFAULT. * * Return: true (nonzero) if the memory block may be valid, false (zero) * if it is definitely invalid. */ #define access_ok(addr, size) \ ({ \ WARN_ON_IN_IRQ(); \ likely(!__range_not_ok(addr, size, TASK_SIZE_MAX)); \ }) extern int __get_user_1(void); extern int __get_user_2(void); extern int __get_user_4(void); extern int __get_user_8(void); extern int __get_user_nocheck_1(void); extern int __get_user_nocheck_2(void); extern int __get_user_nocheck_4(void); extern int __get_user_nocheck_8(void); extern int __get_user_bad(void); #define __uaccess_begin() stac() #define __uaccess_end() clac() #define __uaccess_begin_nospec() \ ({ \ stac(); \ barrier_nospec(); \ }) /* * This is the smallest unsigned integer type that can fit a value * (up to 'long long') */ #define __inttype(x) __typeof__( \ __typefits(x,char, \ __typefits(x,short, \ __typefits(x,int, \ __typefits(x,long,0ULL))))) #define __typefits(x,type,not) \ __builtin_choose_expr(sizeof(x)<=sizeof(type),(unsigned type)0,not) /* * This is used for both get_user() and __get_user() to expand to * the proper special function call that has odd calling conventions * due to returning both a value and an error, and that depends on * the size of the pointer passed in. * * Careful: we have to cast the result to the type of the pointer * for sign reasons. * * The use of _ASM_DX as the register specifier is a bit of a * simplification, as gcc only cares about it as the starting point * and not size: for a 64-bit value it will use %ecx:%edx on 32 bits * (%ecx being the next register in gcc's x86 register sequence), and * %rdx on 64 bits. * * Clang/LLVM cares about the size of the register, but still wants * the base register for something that ends up being a pair. */ #define do_get_user_call(fn,x,ptr) \ ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ __chk_user_ptr(ptr); \ asm volatile("call __" #fn "_%P4" \ : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) /** * get_user - Get a simple variable from user space. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. 
* * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define get_user(x,ptr) ({ might_fault(); do_get_user_call(get_user,x,ptr); }) /** * __get_user - Get a simple variable from user space, with less checking. * @x: Variable to store result. * @ptr: Source address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple variable from user space to kernel * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and the result of * dereferencing @ptr must be assignable to @x without a cast. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. * On error, the variable @x is set to zero. */ #define __get_user(x,ptr) do_get_user_call(get_user_nocheck,x,ptr) #ifdef CONFIG_X86_32 #define __put_user_goto_u64(x, addr, label) \ asm_volatile_goto("\n" \ "1: movl %%eax,0(%1)\n" \ "2: movl %%edx,4(%1)\n" \ _ASM_EXTABLE_UA(1b, %l2) \ _ASM_EXTABLE_UA(2b, %l2) \ : : "A" (x), "r" (addr) \ : : label) #else #define __put_user_goto_u64(x, ptr, label) \ __put_user_goto(x, ptr, "q", "er", label) #endif extern void __put_user_bad(void); /* * Strange magic calling convention: pointer in %ecx, * value in %eax(:%edx), return value in %ecx. clobbers %rbx */ extern void __put_user_1(void); extern void __put_user_2(void); extern void __put_user_4(void); extern void __put_user_8(void); extern void __put_user_nocheck_1(void); extern void __put_user_nocheck_2(void); extern void __put_user_nocheck_4(void); extern void __put_user_nocheck_8(void); /* * ptr must be evaluated and assigned to the temporary __ptr_pu before * the assignment of x to __val_pu, to avoid any function calls * involved in the ptr expression (possibly implicitly generated due * to KASAN) from clobbering %ax. */ #define do_put_user_call(fn,x,ptr) \ ({ \ int __ret_pu; \ void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ __chk_user_ptr(ptr); \ __ptr_pu = (ptr); \ __val_pu = (x); \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ : "0" (__ptr_pu), \ "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ __builtin_expect(__ret_pu, 0); \ }) /** * put_user - Write a simple value into user space. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Return: zero on success, or -EFAULT on error. */ #define put_user(x, ptr) ({ might_fault(); do_put_user_call(put_user,x,ptr); }) /** * __put_user - Write a simple value into user space, with less checking. * @x: Value to copy to user space. * @ptr: Destination address, in user space. * * Context: User context only. This function may sleep if pagefaults are * enabled. * * This macro copies a single simple value from kernel space to user * space. 
It supports simple types like char and int, but not larger * data types like structures or arrays. * * @ptr must have pointer-to-simple-variable type, and @x must be assignable * to the result of dereferencing @ptr. * * Caller must check the pointer with access_ok() before calling this * function. * * Return: zero on success, or -EFAULT on error. */ #define __put_user(x, ptr) do_put_user_call(put_user_nocheck,x,ptr) #define __put_user_size(x, ptr, size, label) \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ __put_user_goto(x, ptr, "b", "iq", label); \ break; \ case 2: \ __put_user_goto(x, ptr, "w", "ir", label); \ break; \ case 4: \ __put_user_goto(x, ptr, "l", "ir", label); \ break; \ case 8: \ __put_user_goto_u64(x, ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, label) do { \ unsigned int __gu_low, __gu_high; \ const unsigned int __user *__gu_ptr; \ __gu_ptr = (const void __user *)(ptr); \ __get_user_asm(__gu_low, __gu_ptr, "l", "=r", label); \ __get_user_asm(__gu_high, __gu_ptr+1, "l", "=r", label); \ (x) = ((unsigned long long)__gu_high << 32) | __gu_low; \ } while (0) #else #define __get_user_asm_u64(x, ptr, label) \ __get_user_asm(x, ptr, "q", "=r", label) #endif #define __get_user_size(x, ptr, size, label) \ do { \ __chk_user_ptr(ptr); \ switch (size) { \ unsigned char x_u8__; \ case 1: \ __get_user_asm(x_u8__, ptr, "b", "=q", label); \ (x) = x_u8__; \ break; \ case 2: \ __get_user_asm(x, ptr, "w", "=r", label); \ break; \ case 4: \ __get_user_asm(x, ptr, "l", "=r", label); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, label); \ break; \ default: \ (x) = __get_user_bad(); \ } \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %[umem],%[output]\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : [output] ltype(x) \ : [umem] "m" (__m(addr)) \ : : label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #ifdef CONFIG_X86_32 #define __get_user_asm_u64(x, ptr, retval) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ asm volatile("\n" \ "1: movl %[lowbits],%%eax\n" \ "2: movl %[highbits],%%edx\n" \ "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: mov %[efault],%[errout]\n" \ " xorl %%eax,%%eax\n" \ " xorl %%edx,%%edx\n" \ " jmp 3b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 4b) \ _ASM_EXTABLE_UA(2b, 4b) \ : [errout] "=r" (retval), \ [output] "=&A"(x) \ : [lowbits] "m" (__m(__ptr)), \ [highbits] "m" __m(((u32 __user *)(__ptr)) + 1), \ [efault] "i" (-EFAULT), "0" (retval)); \ }) #else #define __get_user_asm_u64(x, ptr, retval) \ __get_user_asm(x, ptr, retval, "q", "=r") #endif #define __get_user_size(x, ptr, size, retval) \ do { \ unsigned char x_u8__; \ \ retval = 0; \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ __get_user_asm(x_u8__, ptr, retval, "b", "=q"); \ (x) = x_u8__; \ break; \ case 2: \ __get_user_asm(x, ptr, retval, "w", "=r"); \ break; \ case 4: \ __get_user_asm(x, ptr, retval, "l", "=r"); \ break; \ case 8: \ __get_user_asm_u64(x, ptr, retval); \ break; \ default: \ (x) = __get_user_bad(); \ } \ } while (0) #define __get_user_asm(x, addr, err, itype, ltype) \ asm volatile("\n" \ "1: mov"itype" %[umem],%[output]\n" \ "2:\n" \ ".section .fixup,\"ax\"\n" \ "3: mov %[efault],%[errout]\n" \ " xorl %k[output],%k[output]\n" \ " jmp 2b\n" \ ".previous\n" \ _ASM_EXTABLE_UA(1b, 3b) \ : [errout] "=r" (err), \ [output] ltype(x) \ : [umem] "m" (__m(addr)), \ [efault] "i" (-EFAULT), "0" (err)) #endif // CONFIG_CC_ASM_GOTO_OUTPUT /* FIXME: this 
hack is definitely wrong -AK */ struct __large_struct { unsigned long buf[100]; }; #define __m(x) (*(struct __large_struct __user *)(x)) /* * Tell gcc we read from memory instead of writing: this is because * we do not write to any memory gcc knows about, so there are no * aliasing issues. */ #define __put_user_goto(x, addr, itype, ltype, label) \ asm_volatile_goto("\n" \ "1: mov"itype" %0,%1\n" \ _ASM_EXTABLE_UA(1b, %l2) \ : : ltype(x), "m" (__m(addr)) \ : : label) extern unsigned long copy_from_user_nmi(void *to, const void __user *from, unsigned long n); extern __must_check long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); unsigned long __must_check clear_user(void __user *mem, unsigned long len); unsigned long __must_check __clear_user(void __user *mem, unsigned long len); #ifdef CONFIG_ARCH_HAS_COPY_MC unsigned long __must_check copy_mc_to_kernel(void *to, const void *from, unsigned len); #define copy_mc_to_kernel copy_mc_to_kernel unsigned long __must_check copy_mc_to_user(void *to, const void *from, unsigned len); #endif /* * movsl can be slow when source and dest are not both 8-byte aligned */ #ifdef CONFIG_X86_INTEL_USERCOPY extern struct movsl_mask { int mask; } ____cacheline_aligned_in_smp movsl_mask; #endif #define ARCH_HAS_NOCACHE_UACCESS 1 #ifdef CONFIG_X86_32 # include <asm/uaccess_32.h> #else # include <asm/uaccess_64.h> #endif /* * The "unsafe" user accesses aren't really "unsafe", but the naming * is a big fat warning: you have to not only do the access_ok() * checking before using them, but you have to surround them with the * user_access_begin/end() pair. */ static __must_check __always_inline bool user_access_begin(const void __user *ptr, size_t len) { if (unlikely(!access_ok(ptr,len))) return 0; __uaccess_begin_nospec(); return 1; } #define user_access_begin(a,b) user_access_begin(a,b) #define user_access_end() __uaccess_end() #define user_access_save() smap_save() #define user_access_restore(x) smap_restore(x) #define unsafe_put_user(x, ptr, label) \ __put_user_size((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr)), label) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), err_label); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define unsafe_get_user(x, ptr, err_label) \ do { \ int __gu_err; \ __inttype(*(ptr)) __gu_val; \ __get_user_size(__gu_val, (ptr), sizeof(*(ptr)), __gu_err); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ if (unlikely(__gu_err)) goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT /* * We want the unsafe accessors to always be inlined and use * the error labels - thus the macro games. 
*/ #define unsafe_copy_loop(dst, src, len, type, label) \ while (len >= sizeof(type)) { \ unsafe_put_user(*(type *)(src),(type __user *)(dst),label); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ } #define unsafe_copy_to_user(_dst,_src,_len,label) \ do { \ char __user *__ucu_dst = (_dst); \ const char *__ucu_src = (_src); \ size_t __ucu_len = (_len); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u64, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u32, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u16, label); \ unsafe_copy_loop(__ucu_dst, __ucu_src, __ucu_len, u8, label); \ } while (0) #define HAVE_GET_KERNEL_NOFAULT #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), err_label) #else // !CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __get_kernel_nofault(dst, src, type, err_label) \ do { \ int __kr_err; \ \ __get_user_size(*((type *)(dst)), (__force type __user *)(src), \ sizeof(type), __kr_err); \ if (unlikely(__kr_err)) \ goto err_label; \ } while (0) #endif // CONFIG_CC_HAS_ASM_GOTO_OUTPUT #define __put_kernel_nofault(dst, src, type, err_label) \ __put_user_size(*((type *)(src)), (__force type __user *)(dst), \ sizeof(type), err_label) #endif /* _ASM_X86_UACCESS_H */
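/*
 * Illustrative sketch (not part of the original header): the discipline the
 * comment above requires around the "unsafe" accessors: a successful
 * user_access_begin() (which performs the access_ok() check) before any
 * unsafe_get_user(), and user_access_end() on every exit path.  The
 * struct example_uarg and example_read_uarg() names are hypothetical.
 */
struct example_uarg {
	u32 flags;
	u32 len;
};

static int example_read_uarg(const struct example_uarg __user *uarg,
			     u32 *flags, u32 *len)
{
	if (!user_access_begin(uarg, sizeof(*uarg)))
		return -EFAULT;

	unsafe_get_user(*flags, &uarg->flags, Efault);
	unsafe_get_user(*len, &uarg->len, Efault);
	user_access_end();
	return 0;

Efault:
	user_access_end();
	return -EFAULT;
}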
2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 2521 2522 2523 2524 2525 2526 2527 2528 2529 2530 2531 2532 2533 2534 2535 2536 2537 2538 2539 2540 2541 2542 2543 2544 2545 2546 2547 2548 2549 2550 2551 2552 2553 2554 2555 2556 2557 2558 2559 2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595 2596 2597 2598 2599 2600 2601 2602 2603 2604 2605 2606 2607 2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 2681 2682 2683 2684 2685 2686 2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864 2865 2866 2867 2868 2869 2870 2871 2872 2873 2874 2875 2876 2877 2878 2879 2880 2881 2882 2883 2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932 2933 2934 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 2982 2983 2984 2985 2986 2987 2988 2989 2990 2991 2992 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 3040 3041 3042 3043 3044 3045 3046 3047 3048 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_FS_H
#define _LINUX_FS_H

#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
#include <linux/dcache.h>
#include <linux/path.h>
#include <linux/stat.h>
#include <linux/cache.h>
#include <linux/list.h>
#include <linux/list_lru.h>
#include <linux/llist.h>
#include <linux/radix-tree.h>
#include <linux/xarray.h>
#include <linux/rbtree.h>
#include <linux/init.h>
#include <linux/pid.h>
#include <linux/bug.h>
#include <linux/mutex.h>
#include <linux/rwsem.h>
#include <linux/mm_types.h>
#include <linux/capability.h>
#include <linux/semaphore.h>
#include <linux/fcntl.h>
#include <linux/rculist_bl.h>
#include <linux/atomic.h>
#include <linux/shrinker.h>
#include <linux/migrate_mode.h>
#include <linux/uidgid.h>
#include <linux/lockdep.h>
#include <linux/percpu-rwsem.h>
#include <linux/workqueue.h>
#include <linux/delayed_call.h>
#include <linux/uuid.h>
#include <linux/errseq.h>
#include <linux/ioprio.h>
#include <linux/fs_types.h>
#include <linux/build_bug.h>
#include <linux/stddef.h>

#include <asm/byteorder.h>
#include <uapi/linux/fs.h>

struct backing_dev_info;
struct bdi_writeback;
struct bio;
struct export_operations;
struct fiemap_extent_info;
struct hd_geometry;
struct iovec;
struct kiocb;
struct kobject;
struct pipe_inode_info;
struct poll_table_struct;
struct kstatfs;
struct vm_area_struct;
struct vfsmount;
struct cred;
struct
swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; struct fscrypt_info; struct fscrypt_operations; struct fsverity_info; struct fsverity_operations; struct fs_context; struct fs_parameter_spec; extern void __init inode_init(void); extern void __init inode_init_early(void); extern void __init files_init(void); extern void __init files_maxfiles_init(void); extern struct files_stat_struct files_stat; extern unsigned long get_max_files(void); extern unsigned int sysctl_nr_open; extern struct inodes_stat_t inodes_stat; extern int leases_enable, lease_break_time; extern int sysctl_protected_symlinks; extern int sysctl_protected_hardlinks; extern int sysctl_protected_fifos; extern int sysctl_protected_regular; typedef __kernel_rwf_t rwf_t; struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef int (dio_iodone_t)(struct kiocb *iocb, loff_t offset, ssize_t bytes, void *private); #define MAY_EXEC 0x00000001 #define MAY_WRITE 0x00000002 #define MAY_READ 0x00000004 #define MAY_APPEND 0x00000008 #define MAY_ACCESS 0x00000010 #define MAY_OPEN 0x00000020 #define MAY_CHDIR 0x00000040 /* called from RCU mode, don't block */ #define MAY_NOT_BLOCK 0x00000080 /* * flags in file.f_mode. Note that FMODE_READ and FMODE_WRITE must correspond * to O_WRONLY and O_RDWR via the strange trick in do_dentry_open() */ /* file is open for reading */ #define FMODE_READ ((__force fmode_t)0x1) /* file is open for writing */ #define FMODE_WRITE ((__force fmode_t)0x2) /* file is seekable */ #define FMODE_LSEEK ((__force fmode_t)0x4) /* file can be accessed using pread */ #define FMODE_PREAD ((__force fmode_t)0x8) /* file can be accessed using pwrite */ #define FMODE_PWRITE ((__force fmode_t)0x10) /* File is opened for execution with sys_execve / sys_uselib */ #define FMODE_EXEC ((__force fmode_t)0x20) /* File is opened with O_NDELAY (only set for block devices) */ #define FMODE_NDELAY ((__force fmode_t)0x40) /* File is opened with O_EXCL (only set for block devices) */ #define FMODE_EXCL ((__force fmode_t)0x80) /* File is opened using open(.., 3, ..) and is writeable only for ioctls (specialy hack for floppy.c) */ #define FMODE_WRITE_IOCTL ((__force fmode_t)0x100) /* 32bit hashes as llseek() offset (for directories) */ #define FMODE_32BITHASH ((__force fmode_t)0x200) /* 64bit hashes as llseek() offset (for directories) */ #define FMODE_64BITHASH ((__force fmode_t)0x400) /* * Don't update ctime and mtime. * * Currently a special hack for the XFS open_by_handle ioctl, but we'll * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. */ #define FMODE_NOCMTIME ((__force fmode_t)0x800) /* Expect random access pattern */ #define FMODE_RANDOM ((__force fmode_t)0x1000) /* File is huge (eg. 
/dev/kmem): treat loff_t as unsigned */ #define FMODE_UNSIGNED_OFFSET ((__force fmode_t)0x2000) /* File is opened with O_PATH; almost nothing can be done with it */ #define FMODE_PATH ((__force fmode_t)0x4000) /* File needs atomic accesses to f_pos */ #define FMODE_ATOMIC_POS ((__force fmode_t)0x8000) /* Write access to underlying fs */ #define FMODE_WRITER ((__force fmode_t)0x10000) /* Has read method(s) */ #define FMODE_CAN_READ ((__force fmode_t)0x20000) /* Has write method(s) */ #define FMODE_CAN_WRITE ((__force fmode_t)0x40000) #define FMODE_OPENED ((__force fmode_t)0x80000) #define FMODE_CREATED ((__force fmode_t)0x100000) /* File is stream-like */ #define FMODE_STREAM ((__force fmode_t)0x200000) /* File was opened by fanotify and shouldn't generate fanotify events */ #define FMODE_NONOTIFY ((__force fmode_t)0x4000000) /* File is capable of returning -EAGAIN if I/O will block */ #define FMODE_NOWAIT ((__force fmode_t)0x8000000) /* File represents mount that needs unmounting */ #define FMODE_NEED_UNMOUNT ((__force fmode_t)0x10000000) /* File does not contribute to nr_files count */ #define FMODE_NOACCOUNT ((__force fmode_t)0x20000000) /* File supports async buffered reads */ #define FMODE_BUF_RASYNC ((__force fmode_t)0x40000000) /* * Attribute flags. These should be or-ed together to figure out what * has been changed! */ #define ATTR_MODE (1 << 0) #define ATTR_UID (1 << 1) #define ATTR_GID (1 << 2) #define ATTR_SIZE (1 << 3) #define ATTR_ATIME (1 << 4) #define ATTR_MTIME (1 << 5) #define ATTR_CTIME (1 << 6) #define ATTR_ATIME_SET (1 << 7) #define ATTR_MTIME_SET (1 << 8) #define ATTR_FORCE (1 << 9) /* Not a change, but a change it */ #define ATTR_KILL_SUID (1 << 11) #define ATTR_KILL_SGID (1 << 12) #define ATTR_FILE (1 << 13) #define ATTR_KILL_PRIV (1 << 14) #define ATTR_OPEN (1 << 15) /* Truncating from open(O_TRUNC) */ #define ATTR_TIMES_SET (1 << 16) #define ATTR_TOUCH (1 << 17) /* * Whiteout is represented by a char device. The following constants define the * mode and device number to use. */ #define WHITEOUT_MODE 0 #define WHITEOUT_DEV 0 /* * This is the Inode Attributes structure, used for notify_change(). It * uses the above definitions as flags, to know which values have changed. * Also, in this manner, a Filesystem can look at only the values it cares * about. Basically, these are the attributes that the VFS layer can * request to change from the FS layer. * * Derek Atkins <warlord@MIT.EDU> 94-10-20 */ struct iattr { unsigned int ia_valid; umode_t ia_mode; kuid_t ia_uid; kgid_t ia_gid; loff_t ia_size; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; /* * Not an attribute, but an auxiliary info for filesystems wanting to * implement an ftruncate() like method. NOTE: filesystem should * check for (ia_valid & ATTR_FILE), and not for (ia_file != NULL). */ struct file *ia_file; }; /* * Includes for diskquotas. */ #include <linux/quota.h> /* * Maximum number of layers of fs stack. Needs to be limited to * prevent kernel stack overflow */ #define FILESYSTEM_MAX_STACK_DEPTH 2 /** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has * completed, that the page is still locked, and * should be considered active. The VM uses this hint * to return the page to the active list -- it won't * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. 
Returned by * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. * The caller should back up to acquiring a new page and * trying again. The aop will be taking reasonable * precautions not to livelock. If the caller held a page * reference, it should drop it before retrying. Returned * by readpage(). * * address_space_operation functions return these large constants to indicate * special semantics to the caller. These are much larger than the bytes in a * page to allow for functions that return the number of bytes operated on in a * given page. */ enum positive_aop_returns { AOP_WRITEPAGE_ACTIVATE = 0x80000, AOP_TRUNCATED_PAGE = 0x80001, }; #define AOP_FLAG_CONT_EXPAND 0x0001 /* called from cont_expand */ #define AOP_FLAG_NOFS 0x0002 /* used by filesystem to direct * helper code (eg buffer layer) * to clear GFP_FS from alloc */ /* * oh the beauties of C type declarations. */ struct page; struct address_space; struct writeback_control; struct readahead_control; /* * Write life time hint values. * Stored in struct inode as u8. */ enum rw_hint { WRITE_LIFE_NOT_SET = 0, WRITE_LIFE_NONE = RWH_WRITE_LIFE_NONE, WRITE_LIFE_SHORT = RWH_WRITE_LIFE_SHORT, WRITE_LIFE_MEDIUM = RWH_WRITE_LIFE_MEDIUM, WRITE_LIFE_LONG = RWH_WRITE_LIFE_LONG, WRITE_LIFE_EXTREME = RWH_WRITE_LIFE_EXTREME, }; /* Match RWF_* bits to IOCB bits */ #define IOCB_HIPRI (__force int) RWF_HIPRI #define IOCB_DSYNC (__force int) RWF_DSYNC #define IOCB_SYNC (__force int) RWF_SYNC #define IOCB_NOWAIT (__force int) RWF_NOWAIT #define IOCB_APPEND (__force int) RWF_APPEND /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) #define IOCB_DIRECT (1 << 17) #define IOCB_WRITE (1 << 18) /* iocb->ki_waitq is valid */ #define IOCB_WAITQ (1 << 19) #define IOCB_NOIO (1 << 20) struct kiocb { struct file *ki_filp; /* The 'ki_filp' pointer is shared in a union for aio */ randomized_struct_fields_start loff_t ki_pos; void (*ki_complete)(struct kiocb *iocb, long ret, long ret2); void *private; int ki_flags; u16 ki_hint; u16 ki_ioprio; /* See linux/ioprio.h */ union { unsigned int ki_cookie; /* for ->iopoll */ struct wait_page_queue *ki_waitq; /* for async buffered IO */ }; randomized_struct_fields_end }; static inline bool is_sync_kiocb(struct kiocb *kiocb) { return kiocb->ki_complete == NULL; } /* * "descriptor" for what we're up to with a read. * This allows us to use the same read code yet * have multiple different users of the data that * we read from a file. * * The simplest case just copies the data to user * mode. */ typedef struct { size_t written; size_t count; union { char __user *buf; void *data; } arg; int error; } read_descriptor_t; typedef int (*read_actor_t)(read_descriptor_t *, struct page *, unsigned long, unsigned long); struct address_space_operations { int (*writepage)(struct page *page, struct writeback_control *wbc); int (*readpage)(struct file *, struct page *); /* Write back some dirty pages from this mapping. */ int (*writepages)(struct address_space *, struct writeback_control *); /* Set a page dirty. Return true if this dirtied it */ int (*set_page_dirty)(struct page *page); /* * Reads in the requested pages. Unlike ->readpage(), this is * PURELY used for read-ahead!. 
*/ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); int (*write_end)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); /* Unfortunately this kludge is needed for FIBMAP. Don't use it */ sector_t (*bmap)(struct address_space *, sector_t); void (*invalidatepage) (struct page *, unsigned int, unsigned int); int (*releasepage) (struct page *, gfp_t); void (*freepage)(struct page *); ssize_t (*direct_IO)(struct kiocb *, struct iov_iter *iter); /* * migrate the contents of a page to the specified target. If * migrate_mode is MIGRATE_ASYNC, it must not block. */ int (*migratepage) (struct address_space *, struct page *, struct page *, enum migrate_mode); bool (*isolate_page)(struct page *, isolate_mode_t); void (*putback_page)(struct page *); int (*launder_page) (struct page *); int (*is_partially_uptodate) (struct page *, unsigned long, unsigned long); void (*is_dirty_writeback) (struct page *, bool *, bool *); int (*error_remove_page)(struct address_space *, struct page *); /* swapfile support */ int (*swap_activate)(struct swap_info_struct *sis, struct file *file, sector_t *span); void (*swap_deactivate)(struct file *file); }; extern const struct address_space_operations empty_aops; /* * pagecache_write_begin/pagecache_write_end must be used by general code * to write into the pagecache. */ int pagecache_write_begin(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); int pagecache_write_end(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata); /** * struct address_space - Contents of a cacheable, mappable object. * @host: Owner, either the inode or the block_device. * @i_pages: Cached pages. * @gfp_mask: Memory allocation flags to use for allocating pages. * @i_mmap_writable: Number of VM_SHARED mappings. * @nr_thps: Number of THPs in the pagecache (non-shmem only). * @i_mmap: Tree of private and shared mappings. * @i_mmap_rwsem: Protects @i_mmap and @i_mmap_writable. * @nrpages: Number of page entries, protected by the i_pages lock. * @nrexceptional: Shadow or DAX entries, protected by the i_pages lock. * @writeback_index: Writeback starts here. * @a_ops: Methods. * @flags: Error bits and flags (AS_*). * @wb_err: The most recent error which has occurred. * @private_lock: For use by the owner of the address_space. * @private_list: For use by the owner of the address_space. * @private_data: For use by the owner of the address_space. 
*/ struct address_space { struct inode *host; struct xarray i_pages; gfp_t gfp_mask; atomic_t i_mmap_writable; #ifdef CONFIG_READ_ONLY_THP_FOR_FS /* number of thp, only for non-shmem files */ atomic_t nr_thps; #endif struct rb_root_cached i_mmap; struct rw_semaphore i_mmap_rwsem; unsigned long nrpages; unsigned long nrexceptional; pgoff_t writeback_index; const struct address_space_operations *a_ops; unsigned long flags; errseq_t wb_err; spinlock_t private_lock; struct list_head private_list; void *private_data; } __attribute__((aligned(sizeof(long)))) __randomize_layout; /* * On most architectures that alignment is already the case; but * must be enforced here for CRIS, to let the least significant bit * of struct page's "mapping" pointer be used for PAGE_MAPPING_ANON. */ /* XArray tags, for tagging dirty and writeback pages in the pagecache. */ #define PAGECACHE_TAG_DIRTY XA_MARK_0 #define PAGECACHE_TAG_WRITEBACK XA_MARK_1 #define PAGECACHE_TAG_TOWRITE XA_MARK_2 /* * Returns true if any of the pages in the mapping are marked with the tag. */ static inline bool mapping_tagged(struct address_space *mapping, xa_mark_t tag) { return xa_marked(&mapping->i_pages, tag); } static inline void i_mmap_lock_write(struct address_space *mapping) { down_write(&mapping->i_mmap_rwsem); } static inline int i_mmap_trylock_write(struct address_space *mapping) { return down_write_trylock(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_write(struct address_space *mapping) { up_write(&mapping->i_mmap_rwsem); } static inline void i_mmap_lock_read(struct address_space *mapping) { down_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_unlock_read(struct address_space *mapping) { up_read(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_locked(struct address_space *mapping) { lockdep_assert_held(&mapping->i_mmap_rwsem); } static inline void i_mmap_assert_write_locked(struct address_space *mapping) { lockdep_assert_held_write(&mapping->i_mmap_rwsem); } /* * Might pages of this file be mapped into userspace? */ static inline int mapping_mapped(struct address_space *mapping) { return !RB_EMPTY_ROOT(&mapping->i_mmap.rb_root); } /* * Might pages of this file have been modified in userspace? * Note that i_mmap_writable counts all VM_SHARED vmas: do_mmap * marks vma as VM_SHARED if it is shared, and the file was opened for * writing i.e. vma may be mprotected writable even if now readonly. * * If i_mmap_writable is negative, no new writable mappings are allowed. You * can only deny writable mappings, if none exists right now. */ static inline int mapping_writably_mapped(struct address_space *mapping) { return atomic_read(&mapping->i_mmap_writable) > 0; } static inline int mapping_map_writable(struct address_space *mapping) { return atomic_inc_unless_negative(&mapping->i_mmap_writable) ? 0 : -EPERM; } static inline void mapping_unmap_writable(struct address_space *mapping) { atomic_dec(&mapping->i_mmap_writable); } static inline int mapping_deny_writable(struct address_space *mapping) { return atomic_dec_unless_positive(&mapping->i_mmap_writable) ? 0 : -EBUSY; } static inline void mapping_allow_writable(struct address_space *mapping) { atomic_inc(&mapping->i_mmap_writable); } /* * Use sequence counter to get consistent i_size on 32-bit processors. 
*/ #if BITS_PER_LONG==32 && defined(CONFIG_SMP) #include <linux/seqlock.h> #define __NEED_I_SIZE_ORDERED #define i_size_ordered_init(inode) seqcount_init(&inode->i_size_seqcount) #else #define i_size_ordered_init(inode) do { } while (0) #endif struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) #define ACL_DONT_CACHE ((void *)(-3)) static inline struct posix_acl * uncached_acl_sentinel(struct task_struct *task) { return (void *)task + 1; } static inline bool is_uncached_acl(struct posix_acl *acl) { return (long)acl & 1; } #define IOP_FASTPERM 0x0001 #define IOP_LOOKUP 0x0002 #define IOP_NOFOLLOW 0x0004 #define IOP_XATTR 0x0008 #define IOP_DEFAULT_READLINK 0x0010 struct fsnotify_mark_connector; /* * Keep mostly read-only and often accessed (especially for * the RCU path lookup and 'stat' data) fields at the beginning * of the 'struct inode' */ struct inode { umode_t i_mode; unsigned short i_opflags; kuid_t i_uid; kgid_t i_gid; unsigned int i_flags; #ifdef CONFIG_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif const struct inode_operations *i_op; struct super_block *i_sb; struct address_space *i_mapping; #ifdef CONFIG_SECURITY void *i_security; #endif /* Stat data, not accessed from path walking */ unsigned long i_ino; /* * Filesystems may only read i_nlink directly. They shall use the * following functions for modification: * * (set|clear|inc|drop)_nlink * inode_(inc|dec)_link_count */ union { const unsigned int i_nlink; unsigned int __i_nlink; }; dev_t i_rdev; loff_t i_size; struct timespec64 i_atime; struct timespec64 i_mtime; struct timespec64 i_ctime; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; u8 i_blkbits; u8 i_write_hint; blkcnt_t i_blocks; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; #endif /* Misc */ unsigned long i_state; struct rw_semaphore i_rwsem; unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_time_when; struct hlist_node i_hash; struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ /* foreign inode detection, see wbc_detach_inode() */ int i_wb_frn_winner; u16 i_wb_frn_avg_time; u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; struct rcu_head i_rcu; }; atomic64_t i_version; atomic64_t i_sequence; /* see futex */ atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); }; struct file_lock_context *i_flctx; struct address_space i_data; struct list_head i_devices; union { struct pipe_inode_info *i_pipe; struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; unsigned i_dir_seq; }; __u32 i_generation; #ifdef CONFIG_FSNOTIFY __u32 i_fsnotify_mask; /* all events this inode cares about */ struct fsnotify_mark_connector __rcu *i_fsnotify_marks; #endif #ifdef CONFIG_FS_ENCRYPTION struct fscrypt_info *i_crypt_info; #endif #ifdef CONFIG_FS_VERITY struct fsverity_info *i_verity_info; #endif void *i_private; /* fs or device private pointer */ } __randomize_layout; struct timespec64 timestamp_truncate(struct timespec64 t, struct inode *inode); static inline unsigned int i_blocksize(const struct inode 
*node) { return (1 << node->i_blkbits); } static inline int inode_unhashed(struct inode *inode) { return hlist_unhashed(&inode->i_hash); } /* * __mark_inode_dirty expects inodes to be hashed. Since we don't * want special inodes in the fileset inode space, we make them * appear hashed, but do not put on any lists. hlist_del() * will work fine and require no locking. */ static inline void inode_fake_hash(struct inode *inode) { hlist_add_fake(&inode->i_hash); } /* * inode->i_mutex nesting subclasses for the lock validator: * * 0: the object of the current VFS operation * 1: parent * 2: child/target * 3: xattr * 4: second non-directory * 5: second parent (when locking independent directories in rename) * * I_MUTEX_NONDIR2 is for certain operations (such as rename) which lock two * non-directories at once. * * The locking order between these classes is * parent[2] -> child -> grandchild -> normal -> xattr -> second non-directory */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, I_MUTEX_XATTR, I_MUTEX_NONDIR2, I_MUTEX_PARENT2, }; static inline void inode_lock(struct inode *inode) { down_write(&inode->i_rwsem); } static inline void inode_unlock(struct inode *inode) { up_write(&inode->i_rwsem); } static inline void inode_lock_shared(struct inode *inode) { down_read(&inode->i_rwsem); } static inline void inode_unlock_shared(struct inode *inode) { up_read(&inode->i_rwsem); } static inline int inode_trylock(struct inode *inode) { return down_write_trylock(&inode->i_rwsem); } static inline int inode_trylock_shared(struct inode *inode) { return down_read_trylock(&inode->i_rwsem); } static inline int inode_is_locked(struct inode *inode) { return rwsem_is_locked(&inode->i_rwsem); } static inline void inode_lock_nested(struct inode *inode, unsigned subclass) { down_write_nested(&inode->i_rwsem, subclass); } static inline void inode_lock_shared_nested(struct inode *inode, unsigned subclass) { down_read_nested(&inode->i_rwsem, subclass); } void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); /* * NOTE: in a 32bit arch with a preemptable kernel and * an UP compile the i_size_read/write must be atomic * with respect to the local cpu (unlike with preempt disabled), * but they don't need to be atomic with respect to other cpus like in * true SMP (so they need either to either locally disable irq around * the read or for example on x86 they can be still implemented as a * cmpxchg8b without the need of the lock prefix). For SMP compiles * and 64bit archs it makes no difference if preempt is enabled or not. */ static inline loff_t i_size_read(const struct inode *inode) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) loff_t i_size; unsigned int seq; do { seq = read_seqcount_begin(&inode->i_size_seqcount); i_size = inode->i_size; } while (read_seqcount_retry(&inode->i_size_seqcount, seq)); return i_size; #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) loff_t i_size; preempt_disable(); i_size = inode->i_size; preempt_enable(); return i_size; #else return inode->i_size; #endif } /* * NOTE: unlike i_size_read(), i_size_write() does need locking around it * (normally i_mutex), otherwise on 32bit/SMP an update of i_size_seqcount * can be lost, resulting in subsequent i_size_read() calls spinning forever. 
*/ static inline void i_size_write(struct inode *inode, loff_t i_size) { #if BITS_PER_LONG==32 && defined(CONFIG_SMP) preempt_disable(); write_seqcount_begin(&inode->i_size_seqcount); inode->i_size = i_size; write_seqcount_end(&inode->i_size_seqcount); preempt_enable(); #elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) preempt_disable(); inode->i_size = i_size; preempt_enable(); #else inode->i_size = i_size; #endif } static inline unsigned iminor(const struct inode *inode) { return MINOR(inode->i_rdev); } static inline unsigned imajor(const struct inode *inode) { return MAJOR(inode->i_rdev); } struct fown_struct { rwlock_t lock; /* protects pid, uid, euid fields */ struct pid *pid; /* pid or -pgrp where SIGIO should be sent */ enum pid_type pid_type; /* Kind of process group SIGIO should be sent to */ kuid_t uid, euid; /* uid/euid of process setting the owner */ int signum; /* posix.1b rt signal to be delivered on IO */ }; /* * Track a single file's readahead state */ struct file_ra_state { pgoff_t start; /* where readahead started */ unsigned int size; /* # of readahead pages */ unsigned int async_size; /* do asynchronous readahead when there are only # of pages ahead */ unsigned int ra_pages; /* Maximum readahead window */ unsigned int mmap_miss; /* Cache miss stat for mmap accesses */ loff_t prev_pos; /* Cache last read() position */ }; /* * Check if @index falls in the readahead windows. */ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) { return (index >= ra->start && index < ra->start + ra->size); } struct file { union { struct llist_node fu_llist; struct rcu_head fu_rcuhead; } f_u; struct path f_path; struct inode *f_inode; /* cached value */ const struct file_operations *f_op; /* * Protects f_ep_links, f_flags. * Must not be taken from IRQ context. */ spinlock_t f_lock; enum rw_hint f_write_hint; atomic_long_t f_count; unsigned int f_flags; fmode_t f_mode; struct mutex f_pos_lock; loff_t f_pos; struct fown_struct f_owner; const struct cred *f_cred; struct file_ra_state f_ra; u64 f_version; #ifdef CONFIG_SECURITY void *f_security; #endif /* needed for tty driver, and maybe others */ void *private_data; #ifdef CONFIG_EPOLL /* Used by fs/eventpoll.c to link all the hooks to this file */ struct list_head f_ep_links; struct list_head f_tfile_llink; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; errseq_t f_wb_err; errseq_t f_sb_err; /* for syncfs */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ struct file_handle { __u32 handle_bytes; int handle_type; /* file identifier */ unsigned char f_handle[]; }; static inline struct file *get_file(struct file *f) { atomic_long_inc(&f->f_count); return f; } #define get_file_rcu_many(x, cnt) \ atomic_long_add_unless(&(x)->f_count, (cnt), 0) #define get_file_rcu(x) get_file_rcu_many((x), 1) #define file_count(x) atomic_long_read(&(x)->f_count) #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes limits, otherwise bad things can happen in VM. 
*/ #if BITS_PER_LONG==32 #define MAX_LFS_FILESIZE ((loff_t)ULONG_MAX << PAGE_SHIFT) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE ((loff_t)LLONG_MAX) #endif #define FL_POSIX 1 #define FL_FLOCK 2 #define FL_DELEG 4 /* NFSv4 delegation */ #define FL_ACCESS 8 /* not trying to lock, just looking */ #define FL_EXISTS 16 /* when unlocking, test for existence */ #define FL_LEASE 32 /* lease held on this file */ #define FL_CLOSE 64 /* unlock on close */ #define FL_SLEEP 128 /* A blocking lock */ #define FL_DOWNGRADE_PENDING 256 /* Lease is being downgraded */ #define FL_UNLOCK_PENDING 512 /* Lease is being broken */ #define FL_OFDLCK 1024 /* lock is "owned" by struct file */ #define FL_LAYOUT 2048 /* outstanding pNFS layout */ #define FL_CLOSE_POSIX (FL_POSIX | FL_CLOSE) /* * Special return value from posix_lock_file() and vfs_lock_file() for * asynchronous locking. */ #define FILE_LOCK_DEFERRED 1 /* legacy typedef, should eventually be removed */ typedef void *fl_owner_t; struct file_lock; struct file_lock_operations { void (*fl_copy_lock)(struct file_lock *, struct file_lock *); void (*fl_release_private)(struct file_lock *); }; struct lock_manager_operations { fl_owner_t (*lm_get_owner)(fl_owner_t); void (*lm_put_owner)(fl_owner_t); void (*lm_notify)(struct file_lock *); /* unblock callback */ int (*lm_grant)(struct file_lock *, int); bool (*lm_break)(struct file_lock *); int (*lm_change)(struct file_lock *, int, struct list_head *); void (*lm_setup)(struct file_lock *, void **); bool (*lm_breaker_owns_lease)(struct file_lock *); }; struct lock_manager { struct list_head list; /* * NFSv4 and up also want opens blocked during the grace period; * NLM doesn't care: */ bool block_opens; }; struct net; void locks_start_grace(struct net *, struct lock_manager *); void locks_end_grace(struct lock_manager *); bool locks_in_grace(struct net *); bool opens_in_grace(struct net *); /* that will die - we need it for nfs_lock_info */ #include <linux/nfs_fs_i.h> /* * struct file_lock represents a generic "file lock". It's used to represent * POSIX byte range locks, BSD (flock) locks, and leases. It's important to * note that the same struct is used to represent both a request for a lock and * the lock itself, but the same object is never used for both. * * FIXME: should we create a separate "struct lock_request" to help distinguish * these two uses? * * The varous i_flctx lists are ordered by: * * 1) lock owner * 2) lock range start * 3) lock range end * * Obviously, the last two criteria only matter for POSIX locks. */ struct file_lock { struct file_lock *fl_blocker; /* The lock, that is blocking us */ struct list_head fl_list; /* link into file_lock_context */ struct hlist_node fl_link; /* node in global lists */ struct list_head fl_blocked_requests; /* list of requests with * ->fl_blocker pointing here */ struct list_head fl_blocked_member; /* node in * ->fl_blocker->fl_blocked_requests */ fl_owner_t fl_owner; unsigned int fl_flags; unsigned char fl_type; unsigned int fl_pid; int fl_link_cpu; /* what cpu's list is this on? 
*/ wait_queue_head_t fl_wait; struct file *fl_file; loff_t fl_start; loff_t fl_end; struct fasync_struct * fl_fasync; /* for lease break notifications */ /* for lease breaks: */ unsigned long fl_break_time; unsigned long fl_downgrade_time; const struct file_lock_operations *fl_ops; /* Callbacks for filesystems */ const struct lock_manager_operations *fl_lmops; /* Callbacks for lockmanagers */ union { struct nfs_lock_info nfs_fl; struct nfs4_lock_info nfs4_fl; struct { struct list_head link; /* link in AFS vnode's pending_locks list */ int state; /* state of grant or error if -ve */ unsigned int debug_id; } afs; } fl_u; } __randomize_layout; struct file_lock_context { spinlock_t flc_lock; struct list_head flc_flock; struct list_head flc_posix; struct list_head flc_lease; }; /* The following constant reflects the upper bound of the file/locking space */ #ifndef OFFSET_MAX #define INT_LIMIT(x) (~((x)1 << (sizeof(x)*8 - 1))) #define OFFSET_MAX INT_LIMIT(loff_t) #define OFFT_OFFSET_MAX INT_LIMIT(off_t) #endif extern void send_sigio(struct fown_struct *fown, int fd, int band); #define locks_inode(f) file_inode(f) #ifdef CONFIG_FILE_LOCKING extern int fcntl_getlk(struct file *, unsigned int, struct flock *); extern int fcntl_setlk(unsigned int, struct file *, unsigned int, struct flock *); #if BITS_PER_LONG == 32 extern int fcntl_getlk64(struct file *, unsigned int, struct flock64 *); extern int fcntl_setlk64(unsigned int, struct file *, unsigned int, struct flock64 *); #endif extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); extern int fcntl_getlease(struct file *filp); /* fs/locks.c */ void locks_free_lock_context(struct inode *inode); void locks_free_lock(struct file_lock *fl); extern void locks_init_lock(struct file_lock *); extern struct file_lock * locks_alloc_lock(void); extern void locks_copy_lock(struct file_lock *, struct file_lock *); extern void locks_copy_conflock(struct file_lock *, struct file_lock *); extern void locks_remove_posix(struct file *, fl_owner_t); extern void locks_remove_file(struct file *); extern void locks_release_private(struct file_lock *); extern void posix_test_lock(struct file *, struct file_lock *); extern int posix_lock_file(struct file *, struct file_lock *, struct file_lock *); extern int locks_delete_block(struct file_lock *); extern int vfs_test_lock(struct file *, struct file_lock *); extern int vfs_lock_file(struct file *, unsigned int, struct file_lock *, struct file_lock *); extern int vfs_cancel_lock(struct file *filp, struct file_lock *fl); extern int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl); extern int __break_lease(struct inode *inode, unsigned int flags, unsigned int type); extern void lease_get_mtime(struct inode *, struct timespec64 *time); extern int generic_setlease(struct file *, long, struct file_lock **, void **priv); extern int vfs_setlease(struct file *, long, struct file_lock **, void **); extern int lease_modify(struct file_lock *, int, struct list_head *); struct notifier_block; extern int lease_register_notifier(struct notifier_block *); extern void lease_unregister_notifier(struct notifier_block *); struct files_struct; extern void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files); #else /* !CONFIG_FILE_LOCKING */ static inline int fcntl_getlk(struct file *file, unsigned int cmd, struct flock __user *user) { return -EINVAL; } static inline int fcntl_setlk(unsigned int fd, struct file *file, unsigned int cmd, struct flock __user *user) { return -EACCES; } 
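/*
 * Illustrative sketch only (not part of the original header): a minimal
 * example of how a caller might use the file-locking API declared above
 * (locks_init_lock(), vfs_test_lock()) to probe for a conflicting POSIX
 * lock.  The helper name is hypothetical and the block is guarded out so
 * it is never compiled; a real caller would also fill in fl_owner/fl_pid
 * the way the fcntl(F_GETLK) path does.
 */
#if 0
static int example_probe_posix_wrlock(struct file *filp, loff_t start,
				      loff_t len)
{
	struct file_lock fl;
	int error;

	locks_init_lock(&fl);
	fl.fl_type  = F_WRLCK;
	fl.fl_flags = FL_POSIX;
	fl.fl_start = start;
	fl.fl_end   = len ? start + len - 1 : OFFSET_MAX;

	/* On return, fl.fl_type is F_UNLCK if no conflicting lock exists. */
	error = vfs_test_lock(filp, &fl);
	if (error)
		return error;
	return fl.fl_type == F_UNLCK ? 0 : -EAGAIN;
}
#endif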
#if BITS_PER_LONG == 32 static inline int fcntl_getlk64(struct file *file, unsigned int cmd, struct flock64 __user *user) { return -EINVAL; } static inline int fcntl_setlk64(unsigned int fd, struct file *file, unsigned int cmd, struct flock64 __user *user) { return -EACCES; } #endif static inline int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { return -EINVAL; } static inline int fcntl_getlease(struct file *filp) { return F_UNLCK; } static inline void locks_free_lock_context(struct inode *inode) { } static inline void locks_init_lock(struct file_lock *fl) { return; } static inline void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { return; } static inline void locks_remove_posix(struct file *filp, fl_owner_t owner) { return; } static inline void locks_remove_file(struct file *filp) { return; } static inline void posix_test_lock(struct file *filp, struct file_lock *fl) { return; } static inline int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { return -ENOLCK; } static inline int locks_delete_block(struct file_lock *waiter) { return -ENOENT; } static inline int vfs_test_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int vfs_lock_file(struct file *filp, unsigned int cmd, struct file_lock *fl, struct file_lock *conf) { return -ENOLCK; } static inline int vfs_cancel_lock(struct file *filp, struct file_lock *fl) { return 0; } static inline int locks_lock_inode_wait(struct inode *inode, struct file_lock *fl) { return -ENOLCK; } static inline int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { return 0; } static inline void lease_get_mtime(struct inode *inode, struct timespec64 *time) { return; } static inline int generic_setlease(struct file *filp, long arg, struct file_lock **flp, void **priv) { return -EINVAL; } static inline int vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { return -EINVAL; } static inline int lease_modify(struct file_lock *fl, int arg, struct list_head *dispose) { return -EINVAL; } struct files_struct; static inline void show_fd_locks(struct seq_file *f, struct file *filp, struct files_struct *files) {} #endif /* !CONFIG_FILE_LOCKING */ static inline struct inode *file_inode(const struct file *f) { return f->f_inode; } static inline struct dentry *file_dentry(const struct file *file) { return d_real(file->f_path.dentry, file_inode(file)); } static inline int locks_lock_file_wait(struct file *filp, struct file_lock *fl) { return locks_lock_inode_wait(locks_inode(filp), fl); } struct fasync_struct { rwlock_t fa_lock; int magic; int fa_fd; struct fasync_struct *fa_next; /* singly linked list */ struct file *fa_file; struct rcu_head fa_rcu; }; #define FASYNC_MAGIC 0x4601 /* SMP safe fasync helpers: */ extern int fasync_helper(int, struct file *, int, struct fasync_struct **); extern struct fasync_struct *fasync_insert_entry(int, struct file *, struct fasync_struct **, struct fasync_struct *); extern int fasync_remove_entry(struct file *, struct fasync_struct **); extern struct fasync_struct *fasync_alloc(void); extern void fasync_free(struct fasync_struct *); /* can be called from interrupts */ extern void kill_fasync(struct fasync_struct **, int, int); extern void __f_setown(struct file *filp, struct pid *, enum pid_type, int force); extern int f_setown(struct file *filp, unsigned long arg, int force); extern void 
f_delown(struct file *filp); extern pid_t f_getown(struct file *filp); extern int send_sigurg(struct fown_struct *fown); /* * sb->s_flags. Note that these mirror the equivalent MS_* flags where * represented in both. */ #define SB_RDONLY 1 /* Mount read-only */ #define SB_NOSUID 2 /* Ignore suid and sgid bits */ #define SB_NODEV 4 /* Disallow access to device special files */ #define SB_NOEXEC 8 /* Disallow program execution */ #define SB_SYNCHRONOUS 16 /* Writes are synced at once */ #define SB_MANDLOCK 64 /* Allow mandatory locks on an FS */ #define SB_DIRSYNC 128 /* Directory modifications are synchronous */ #define SB_NOATIME 1024 /* Do not update access times. */ #define SB_NODIRATIME 2048 /* Do not update directory access times */ #define SB_SILENT 32768 #define SB_POSIXACL (1<<16) /* VFS does not apply the umask */ #define SB_INLINECRYPT (1<<17) /* Use blk-crypto for encrypted files */ #define SB_KERNMOUNT (1<<22) /* this is a kern_mount call */ #define SB_I_VERSION (1<<23) /* Update inode I_version field */ #define SB_LAZYTIME (1<<25) /* Update the on-disk [acm]times lazily */ /* These sb flags are internal to the kernel */ #define SB_SUBMOUNT (1<<26) #define SB_FORCE (1<<27) #define SB_NOSEC (1<<28) #define SB_BORN (1<<29) #define SB_ACTIVE (1<<30) #define SB_NOUSER (1<<31) /* These flags relate to encoding and casefolding */ #define SB_ENC_STRICT_MODE_FL (1 << 0) #define sb_has_strict_encoding(sb) \ (sb->s_encoding_flags & SB_ENC_STRICT_MODE_FL) /* * Umount options */ #define MNT_FORCE 0x00000001 /* Attempt to forcibily umount */ #define MNT_DETACH 0x00000002 /* Just detach from the tree */ #define MNT_EXPIRE 0x00000004 /* Mark for expiry */ #define UMOUNT_NOFOLLOW 0x00000008 /* Don't follow symlink on umount */ #define UMOUNT_UNUSED 0x80000000 /* Flag guaranteed to be unused */ /* sb->s_iflags */ #define SB_I_CGROUPWB 0x00000001 /* cgroup-aware writeback enabled */ #define SB_I_NOEXEC 0x00000002 /* Ignore executables on this fs */ #define SB_I_NODEV 0x00000004 /* Ignore devices on this fs */ #define SB_I_STABLE_WRITES 0x00000008 /* don't modify blks until WB is done */ /* sb->s_iflags to limit user namespace mounts */ #define SB_I_USERNS_VISIBLE 0x00000010 /* fstype already mounted */ #define SB_I_IMA_UNVERIFIABLE_SIGNATURE 0x00000020 #define SB_I_UNTRUSTED_MOUNTER 0x00000040 #define SB_I_SKIP_SYNC 0x00000100 /* Skip superblock at global sync */ /* Possible states of 'frozen' field */ enum { SB_UNFROZEN = 0, /* FS is unfrozen */ SB_FREEZE_WRITE = 1, /* Writes, dir ops, ioctls frozen */ SB_FREEZE_PAGEFAULT = 2, /* Page faults stopped as well */ SB_FREEZE_FS = 3, /* For internal FS use (e.g. to stop * internal threads if needed) */ SB_FREEZE_COMPLETE = 4, /* ->freeze_fs finished successfully */ }; #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { int frozen; /* Is sb frozen? 
*/ wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { struct list_head s_list; /* Keep this first */ dev_t s_dev; /* search index; _not_ kdev_t */ unsigned char s_blocksize_bits; unsigned long s_blocksize; loff_t s_maxbytes; /* Max file size */ struct file_system_type *s_type; const struct super_operations *s_op; const struct dquot_operations *dq_op; const struct quotactl_ops *s_qcop; const struct export_operations *s_export_op; unsigned long s_flags; unsigned long s_iflags; /* internal SB_I_* flags */ unsigned long s_magic; struct dentry *s_root; struct rw_semaphore s_umount; int s_count; atomic_t s_active; #ifdef CONFIG_SECURITY void *s_security; #endif const struct xattr_handler **s_xattr; #ifdef CONFIG_FS_ENCRYPTION const struct fscrypt_operations *s_cop; struct key *s_master_keys; /* master crypto keys in use */ #endif #ifdef CONFIG_FS_VERITY const struct fsverity_operations *s_vop; #endif #ifdef CONFIG_UNICODE struct unicode_map *s_encoding; __u16 s_encoding_flags; #endif struct hlist_bl_head s_roots; /* alternate root dentries for NFS */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; struct backing_dev_info *s_bdi; struct mtd_info *s_mtd; struct hlist_node s_instances; unsigned int s_quota_types; /* Bitmask of supported quota types */ struct quota_info s_dquot; /* Diskquota specific options */ struct sb_writers s_writers; /* * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and * s_fsnotify_marks together for cache efficiency. They are frequently * accessed and rarely modified. */ void *s_fs_info; /* Filesystem private info */ /* Granularity of c/m/atime in ns (cannot be worse than a second) */ u32 s_time_gran; /* Time limits for c/m/atime in seconds */ time64_t s_time_min; time64_t s_time_max; #ifdef CONFIG_FSNOTIFY __u32 s_fsnotify_mask; struct fsnotify_mark_connector __rcu *s_fsnotify_marks; #endif char s_id[32]; /* Informational name */ uuid_t s_uuid; /* UUID */ unsigned int s_max_links; fmode_t s_mode; /* * The next field is for VFS *only*. No filesystems have any business * even looking at it. You had been warned. */ struct mutex s_vfs_rename_mutex; /* Kludge */ /* * Filesystem subtype. If non-empty the filesystem type field * in /proc/mounts will be "type.subtype" */ const char *s_subtype; const struct dentry_operations *s_d_op; /* default d_op for dentries */ /* * Saved pool identifier for cleancache (-1 means none) */ int cleancache_poolid; struct shrinker s_shrink; /* per-sb shrinker handle */ /* Number of inodes with nlink == 0 but still referenced */ atomic_long_t s_remove_count; /* Pending fsnotify inode refs */ atomic_long_t s_fsnotify_inode_refs; /* Being remounted read-only */ int s_readonly_remount; /* per-sb errseq_t for reporting writeback errors via syncfs */ errseq_t s_wb_err; /* AIO completions deferred from interrupt context */ struct workqueue_struct *s_dio_done_wq; struct hlist_head s_pins; /* * Owning user namespace and default context in which to * interpret filesystem uids, gids, quotas, device nodes, * xattrs and security labels. */ struct user_namespace *s_user_ns; /* * The list_lru structure is essentially just a pointer to a table * of per-node lru lists, each of which has its own spinlock. * There is no need to put them into separate cachelines. 
*/ struct list_lru s_dentry_lru; struct list_lru s_inode_lru; struct rcu_head rcu; struct work_struct destroy_work; struct mutex s_sync_lock; /* sync serialisation lock */ /* * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; /* s_inode_list_lock protects s_inodes */ spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; struct list_head s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ } __randomize_layout; /* Helper functions so that in most cases filesystems will * not need to deal directly with kuid_t and kgid_t and can * instead deal with the raw numeric values that are stored * in the filesystem. */ static inline uid_t i_uid_read(const struct inode *inode) { return from_kuid(inode->i_sb->s_user_ns, inode->i_uid); } static inline gid_t i_gid_read(const struct inode *inode) { return from_kgid(inode->i_sb->s_user_ns, inode->i_gid); } static inline void i_uid_write(struct inode *inode, uid_t uid) { inode->i_uid = make_kuid(inode->i_sb->s_user_ns, uid); } static inline void i_gid_write(struct inode *inode, gid_t gid) { inode->i_gid = make_kgid(inode->i_sb->s_user_ns, gid); } extern struct timespec64 current_time(struct inode *inode); /* * Snapshotting support. */ /* * These are internal functions, please use sb_start_{write,pagefault,intwrite} * instead. */ static inline void __sb_end_write(struct super_block *sb, int level) { percpu_up_read(sb->s_writers.rw_sem + level-1); } static inline void __sb_start_write(struct super_block *sb, int level) { percpu_down_read(sb->s_writers.rw_sem + level - 1); } static inline bool __sb_start_write_trylock(struct super_block *sb, int level) { return percpu_down_read_trylock(sb->s_writers.rw_sem + level - 1); } #define __sb_writers_acquired(sb, lev) \ percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to * * Decrement number of writers to the filesystem. Wake up possible waiters * wanting to freeze the filesystem. */ static inline void sb_end_write(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_WRITE); } /** * sb_end_pagefault - drop write access to a superblock from a page fault * @sb: the super we wrote to * * Decrement number of processes handling write page fault to the filesystem. * Wake up possible waiters wanting to freeze the filesystem. */ static inline void sb_end_pagefault(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_PAGEFAULT); } /** * sb_end_intwrite - drop write access to a superblock for internal fs purposes * @sb: the super we wrote to * * Decrement fs-internal number of writers to the filesystem. Wake up possible * waiters wanting to freeze the filesystem. */ static inline void sb_end_intwrite(struct super_block *sb) { __sb_end_write(sb, SB_FREEZE_FS); } /** * sb_start_write - get write access to a superblock * @sb: the super we write to * * When a process wants to write data or metadata to a file system (i.e. dirty * a page or an inode), it should embed the operation in a sb_start_write() - * sb_end_write() pair to get exclusion against file system freezing. This * function increments number of writers preventing freezing. If the file * system is already frozen, the function waits until the file system is * thawed. 
* * Since freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. Generally, * freeze protection should be the outermost lock. In particular, we have: * * sb_start_write * -> i_mutex (write path, truncate, directory ops, ...) * -> s_umount (freeze_super, thaw_super) */ static inline void sb_start_write(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_WRITE); } static inline bool sb_start_write_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_WRITE); } /** * sb_start_pagefault - get write access to a superblock from a page fault * @sb: the super we write to * * When a process starts handling write page fault, it should embed the * operation into sb_start_pagefault() - sb_end_pagefault() pair to get * exclusion against file system freezing. This is needed since the page fault * is going to dirty a page. This function increments number of running page * faults preventing freezing. If the file system is already frozen, the * function waits until the file system is thawed. * * Since page fault freeze protection behaves as a lock, users have to preserve * ordering of freeze protection and other filesystem locks. It is advised to * put sb_start_pagefault() close to mmap_lock in lock ordering. Page fault * handling code implies lock dependency: * * mmap_lock * -> sb_start_pagefault */ static inline void sb_start_pagefault(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_PAGEFAULT); } /* * sb_start_intwrite - get write access to a superblock for internal fs purposes * @sb: the super we write to * * This is the third level of protection against filesystem freezing. It is * free for use by a filesystem. The only requirement is that it must rank * below sb_start_pagefault. * * For example filesystem can call sb_start_intwrite() when starting a * transaction which somewhat eases handling of freezing for internal sources * of filesystem changes (internal fs threads, discarding preallocation on file * close, etc.). */ static inline void sb_start_intwrite(struct super_block *sb) { __sb_start_write(sb, SB_FREEZE_FS); } static inline bool sb_start_intwrite_trylock(struct super_block *sb) { return __sb_start_write_trylock(sb, SB_FREEZE_FS); } extern bool inode_owner_or_capable(const struct inode *inode); /* * VFS helper functions.. 

extern bool inode_owner_or_capable(const struct inode *inode);

/*
 * VFS helper functions.
 */
extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
extern int vfs_symlink(struct inode *, struct dentry *, const char *);
extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
extern int vfs_rmdir(struct inode *, struct dentry *);
extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *,
		      struct inode **, unsigned int);

static inline int vfs_whiteout(struct inode *dir, struct dentry *dentry)
{
	return vfs_mknod(dir, dentry, S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV);
}

extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
				  int open_flag);

int vfs_mkobj(struct dentry *, umode_t,
	      int (*f)(struct dentry *, umode_t, void *),
	      void *);

int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);

extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);

#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
			     unsigned long arg);
#else
#define compat_ptr_ioctl NULL
#endif

/*
 * VFS file helper functions.
 */
extern void inode_init_owner(struct inode *inode, const struct inode *dir,
			     umode_t mode);
extern bool may_open_dev(const struct path *path);

/*
 * This is the "filldir" function type, used by readdir() to let
 * the kernel specify what kind of dirent layout it wants to have.
 * This allows the kernel to read directories into kernel space or
 * to have different dirent layouts depending on the binary type.
 */
struct dir_context;
typedef int (*filldir_t)(struct dir_context *, const char *, int, loff_t, u64,
			 unsigned);

struct dir_context {
	filldir_t actor;
	loff_t pos;
};
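
/*
 * Example: a sketch of how a filesystem's ->iterate_shared() method drives
 * the filldir actor through its dir_context.  The filesystem name and the
 * emitted entry are hypothetical; dir_emit() and dir_emit_dots() are the
 * usual helpers for invoking ctx->actor and advancing ctx->pos.
 *
 *	static int foofs_readdir(struct file *file, struct dir_context *ctx)
 *	{
 *		// Emit "." and ".." first; this advances ctx->pos to 2.
 *		if (!dir_emit_dots(file, ctx))
 *			return 0;
 *
 *		// Emit a single hypothetical entry at position 2.
 *		if (ctx->pos == 2) {
 *			if (!dir_emit(ctx, "hello", 5, 42, DT_REG))
 *				return 0;
 *			ctx->pos++;
 *		}
 *		return 0;
 *	}
 */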

/*
 * These flags let !MMU mmap() govern direct device mapping vs immediate
 * copying more easily for MAP_PRIVATE, especially for ROM filesystems.
 *
 * NOMMU_MAP_COPY:	Copy can be mapped (MAP_PRIVATE)
 * NOMMU_MAP_DIRECT:	Can be mapped directly (MAP_SHARED)
 * NOMMU_MAP_READ:	Can be mapped for reading
 * NOMMU_MAP_WRITE:	Can be mapped for writing
 * NOMMU_MAP_EXEC:	Can be mapped for execution
 */
#define NOMMU_MAP_COPY		0x00000001
#define NOMMU_MAP_DIRECT	0x00000008
#define NOMMU_MAP_READ		VM_MAYREAD
#define NOMMU_MAP_WRITE		VM_MAYWRITE
#define NOMMU_MAP_EXEC		VM_MAYEXEC

#define NOMMU_VMFLAGS \
	(NOMMU_MAP_READ | NOMMU_MAP_WRITE | NOMMU_MAP_EXEC)

/*
 * These flags control the behavior of the remap_file_range function pointer.
 * If it is called with len == 0 that means "remap to end of source file".
 * See Documentation/filesystems/vfs.rst for more details about this call.
 *
 * REMAP_FILE_DEDUP: only remap if contents identical (i.e. deduplicate)
 * REMAP_FILE_CAN_SHORTEN: caller can handle a shortened request
 */
#define REMAP_FILE_DEDUP		(1 << 0)
#define REMAP_FILE_CAN_SHORTEN		(1 << 1)

/*
 * These flags signal that the caller is ok with altering various aspects of
 * the behavior of the remap operation.  The changes must be made by the
 * implementation; the vfs remap helper functions can take advantage of them.
 * Flags in this category exist to preserve the quirky behavior of the hoisted
 * btrfs clone/dedupe ioctls.
 */
#define REMAP_FILE_ADVISORY		(REMAP_FILE_CAN_SHORTEN)

struct iov_iter;

struct file_operations {
	struct module *owner;
	loff_t (*llseek) (struct file *, loff_t, int);
	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *);
	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *);
	int (*iopoll)(struct kiocb *kiocb, bool spin);
	int (*iterate) (struct file *, struct dir_context *);
	int (*iterate_shared) (struct file *, struct dir_context *);
	__poll_t (*poll) (struct file *, struct poll_table_struct *);
	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
	long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
	int (*mmap) (struct file *, struct vm_area_struct *);
	unsigned long mmap_supported_flags;
	int (*open) (struct inode *, struct file *);
	int (*flush) (struct file *, fl_owner_t id);
	int (*release) (struct inode *, struct file *);
	int (*fsync) (struct file *, loff_t, loff_t, int datasync);
	int (*fasync) (int, struct file *, int);
	int (*lock) (struct file *, int, struct file_lock *);
	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
	int (*check_flags)(int);
	int (*flock) (struct file *, int, struct file_lock *);
	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
	int (*setlease)(struct file *, long, struct file_lock **, void **);
	long (*fallocate)(struct file *file, int mode, loff_t offset,
			  loff_t len);
	void (*show_fdinfo)(struct seq_file *m, struct file *f);
#ifndef CONFIG_MMU
	unsigned (*mmap_capabilities)(struct file *);
#endif
	ssize_t (*copy_file_range)(struct file *, loff_t, struct file *,
			loff_t, size_t, unsigned int);
	loff_t (*remap_file_range)(struct file *file_in, loff_t pos_in,
				   struct file *file_out, loff_t pos_out,
				   loff_t len, unsigned int remap_flags);
	int (*fadvise)(struct file *, loff_t, loff_t, int);
} __randomize_layout;

struct inode_operations {
	struct dentry * (*lookup) (struct inode *, struct dentry *, unsigned int);
	const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
	int (*permission) (struct inode *, int);
	struct posix_acl * (*get_acl)(struct inode *, int);

	int (*readlink) (struct dentry *, char __user *, int);

	int (*create) (struct inode *, struct dentry *, umode_t, bool);
	int (*link) (struct dentry *, struct inode *, struct dentry *);
	int (*unlink) (struct inode *, struct dentry *);
	int (*symlink) (struct inode *, struct dentry *, const char *);
	int (*mkdir) (struct inode *, struct dentry *, umode_t);
	int (*rmdir) (struct inode *, struct dentry *);
	int (*mknod) (struct inode *, struct dentry *, umode_t, dev_t);
	int (*rename) (struct inode *, struct dentry *,
			struct inode *, struct dentry *, unsigned int);
	int (*setattr) (struct dentry *, struct iattr *);
	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
	ssize_t (*listxattr) (struct dentry *, char *, size_t);
	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
		      u64 len);
	int (*update_time)(struct inode *, struct timespec64 *, int);
	int (*atomic_open)(struct inode *, struct dentry *,
			   struct file *, unsigned open_flag,
			   umode_t create_mode);
	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
	int (*set_acl)(struct inode *, struct posix_acl *, int);
} ____cacheline_aligned;
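
/*
 * Example: a sketch of a minimal instance of the file_operations structure
 * defined above, as a hypothetical character device might provide it.  Only
 * a few methods are filled in; methods left NULL are handled by the VFS
 * (typically as "not supported").  The foo_* names are illustrative only;
 * simple_open(), simple_read_from_buffer() and default_llseek() are existing
 * library helpers.
 *
 *	static ssize_t foo_read(struct file *file, char __user *buf,
 *				size_t count, loff_t *ppos)
 *	{
 *		static const char msg[] = "hello\n";
 *
 *		// Copy from a fixed kernel buffer, honouring *ppos and count.
 *		return simple_read_from_buffer(buf, count, ppos, msg,
 *					       sizeof(msg) - 1);
 *	}
 *
 *	static const struct file_operations foo_fops = {
 *		.owner		= THIS_MODULE,
 *		.open		= simple_open,
 *		.read		= foo_read,
 *		.llseek		= default_llseek,
 *	};
 */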

static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
				     struct iov_iter *iter)
{
	return file->f_op->read_iter(kio, iter);
}

static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
				      struct iov_iter *iter)
{
	return file->f_op->write_iter(kio, iter);
}

static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
	return file->f_op->mmap(file, vma);
}

extern ssize_t vfs_read(struct file *, char __user *, size_t, loff_t *);
extern ssize_t vfs_write(struct file *, const char __user *, size_t, loff_t *);
extern ssize_t vfs_copy_file_range(struct file *, loff_t, struct file *,
				   loff_t, size_t, unsigned int);
extern ssize_t generic_copy_file_range(struct file *file_in, loff_t pos_in,
				       struct file *file_out, loff_t pos_out,
				       size_t len, unsigned int flags);
extern int generic_remap_file_range_prep(struct file *file_in, loff_t pos_in,
					 struct file *file_out, loff_t pos_out,
					 loff_t *count,
					 unsigned int remap_flags);
extern loff_t do_clone_file_range(struct file *file_in, loff_t pos_in,
				  struct file *file_out, loff_t pos_out,
				  loff_t len, unsigned int remap_flags);
extern loff_t vfs_clone_file_range(struct file *file_in, loff_t pos_in,
				   struct file *file_out, loff_t pos_out,
				   loff_t len, unsigned int remap_flags);
extern int vfs_dedupe_file_range(struct file *file,
				 struct file_dedupe_range *same);
extern loff_t vfs_dedupe_file_range_one(struct file *src_file, loff_t src_pos,
					struct file *dst_file, loff_t dst_pos,
					loff_t len, unsigned int remap_flags);


struct super_operations {
	struct inode *(*alloc_inode)(struct super_block *sb);
	void (*destroy_inode)(struct inode *);
	void (*free_inode)(struct inode *);

	void (*dirty_inode) (struct inode *, int flags);
	int (*write_inode) (struct inode *, struct writeback_control *wbc);
	int (*drop_inode) (struct inode *);
	void (*evict_inode) (struct inode *);
	void (*put_super) (struct super_block *);
	int (*sync_fs)(struct super_block *sb, int wait);
	int (*freeze_super) (struct super_block *);
	int (*freeze_fs) (struct super_block *);
	int (*thaw_super) (struct super_block *);
	int (*unfreeze_fs) (struct super_block *);
	int (*statfs) (struct dentry *, struct kstatfs *);
	int (*remount_fs) (struct super_block *, int *, char *);
	void (*umount_begin) (struct super_block *);

	int (*show_options)(struct seq_file *, struct dentry *);
	int (*show_devname)(struct seq_file *, struct dentry *);
	int (*show_path)(struct seq_file *, struct dentry *);
	int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
	struct dquot **(*get_dquots)(struct inode *);
#endif
	int (*bdev_try_to_free_page)(struct super_block *, struct page *, gfp_t);
	long (*nr_cached_objects)(struct super_block *,
				  struct shrink_control *);
	long (*free_cached_objects)(struct super_block *,
				    struct shrink_control *);
};
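
/*
 * Example: a sketch of a minimal super_operations instance for a small
 * in-memory filesystem.  simple_statfs() and generic_delete_inode() are
 * existing library helpers; the foofs_* names and the choice of these
 * particular helpers are illustrative only.
 *
 *	static const struct super_operations foofs_sops = {
 *		.statfs		= simple_statfs,
 *		// Never cache unreferenced inodes; evict them immediately.
 *		.drop_inode	= generic_delete_inode,
 *		// Called at umount time to release per-sb private data.
 *		.put_super	= foofs_put_super,
 *	};
 */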

/*
 * Inode flags - they have no relation to superblock flags now
 */
#define S_SYNC		(1 << 0)  /* Writes are synced at once */
#define S_NOATIME	(1 << 1)  /* Do not update access times */
#define S_APPEND	(1 << 2)  /* Append-only file */
#define S_IMMUTABLE	(1 << 3)  /* Immutable file */
#define S_DEAD		(1 << 4)  /* removed, but still open directory */
#define S_NOQUOTA	(1 << 5)  /* Inode is not counted to quota */
#define S_DIRSYNC	(1 << 6)  /* Directory modifications are synchronous */
#define S_NOCMTIME	(1 << 7)  /* Do not update file c/mtime */
#define S_SWAPFILE	(1 << 8)  /* Do not truncate: swapon got its bmaps */
#define S_PRIVATE	(1 << 9)  /* Inode is fs-internal */
#define S_IMA		(1 << 10) /* Inode has an associated IMA struct */
#define S_AUTOMOUNT	(1 << 11) /* Automount/referral quasi-directory */
#define S_NOSEC		(1 << 12) /* no suid or xattr security attributes */
#ifdef CONFIG_FS_DAX
#define S_DAX		(1 << 13) /* Direct Access, avoiding the page cache */
#else
#define S_DAX		0	  /* Make all the DAX code disappear */
#endif
#define S_ENCRYPTED	(1 << 14) /* Encrypted file (using fs/crypto/) */
#define S_CASEFOLD	(1 << 15) /* Casefolded file */
#define S_VERITY	(1 << 16) /* Verity file (using fs/verity/) */
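
/*
 * Example: these bits live in inode->i_flags and are normally tested through
 * the IS_*() convenience macros provided by the full header (for example
 * IS_IMMUTABLE(inode) or IS_SWAPFILE(inode)) rather than open-coded masks.
 * A hypothetical permission check might look like:
 *
 *	static int foofs_may_modify(struct inode *inode)
 *	{
 *		// Refuse modification of immutable or append-only inodes.
 *		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
 *			return -EPERM;
 *		return 0;
 *	}
 */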