/* SPDX-License-Identifier: GPL-2.0 */ /* linux/include/linux/clockchips.h * * This file contains the structure definitions for clockchips. * * If you are not a clockchip, or the time of day code, you should * not be including this file! */ #ifndef _LINUX_CLOCKCHIPS_H #define _LINUX_CLOCKCHIPS_H #ifdef CONFIG_GENERIC_CLOCKEVENTS # include <linux/clocksource.h> # include <linux/cpumask.h> # include <linux/ktime.h> # include <linux/notifier.h> struct clock_event_device; struct module; /* * Possible states of a clock event device. * * DETACHED: Device is not used by clockevents core. Initial state or can be * reached from SHUTDOWN. * SHUTDOWN: Device is powered-off. Can be reached from PERIODIC or ONESHOT. * PERIODIC: Device is programmed to generate events periodically. Can be * reached from DETACHED or SHUTDOWN. * ONESHOT: Device is programmed to generate event only once. Can be reached * from DETACHED or SHUTDOWN. * ONESHOT_STOPPED: Device was programmed in ONESHOT mode and is temporarily * stopped. */ enum clock_event_state { CLOCK_EVT_STATE_DETACHED, CLOCK_EVT_STATE_SHUTDOWN, CLOCK_EVT_STATE_PERIODIC, CLOCK_EVT_STATE_ONESHOT, CLOCK_EVT_STATE_ONESHOT_STOPPED, }; /* * Clock event features */ # define CLOCK_EVT_FEAT_PERIODIC 0x000001 # define CLOCK_EVT_FEAT_ONESHOT 0x000002 # define CLOCK_EVT_FEAT_KTIME 0x000004 /* * x86(64) specific (mis)features: * * - Clockevent source stops in C3 State and needs broadcast support. * - Local APIC timer is used as a dummy device.
*/ # define CLOCK_EVT_FEAT_C3STOP 0x000008 # define CLOCK_EVT_FEAT_DUMMY 0x000010 /* * Core shall set the interrupt affinity dynamically in broadcast mode */ # define CLOCK_EVT_FEAT_DYNIRQ 0x000020 # define CLOCK_EVT_FEAT_PERCPU 0x000040 /* * Clockevent device is based on a hrtimer for broadcast */ # define CLOCK_EVT_FEAT_HRTIMER 0x000080 /** * struct clock_event_device - clock event device descriptor * @event_handler: Assigned by the framework to be called by the low * level handler of the event source * @set_next_event: set next event function using a clocksource delta * @set_next_ktime: set next event function using a direct ktime value * @next_event: local storage for the next event in oneshot mode * @max_delta_ns: maximum delta value in ns * @min_delta_ns: minimum delta value in ns * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) * @state_use_accessors:current state of the device, assigned by the core code * @features: features * @retries: number of forced programming retries * @set_state_periodic: switch state to periodic * @set_state_oneshot: switch state to oneshot * @set_state_oneshot_stopped: switch state to oneshot_stopped * @set_state_shutdown: switch state to shutdown * @tick_resume: resume clkevt device * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration * @name: ptr to clock event name * @rating: variable to rate clock event devices * @irq: IRQ number (only for non CPU local devices) * @bound_on: Bound on CPU * @cpumask: cpumask to indicate for which CPUs this device works * @list: list head for the management code * @owner: module reference */ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; u32 mult; u32 shift; enum clock_event_state state_use_accessors; unsigned int features; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); int (*set_state_oneshot)(struct clock_event_device *); int (*set_state_oneshot_stopped)(struct clock_event_device *); int (*set_state_shutdown)(struct clock_event_device *); int (*tick_resume)(struct clock_event_device *); void (*broadcast)(const struct cpumask *mask); void (*suspend)(struct clock_event_device *); void (*resume)(struct clock_event_device *); unsigned long min_delta_ticks; unsigned long max_delta_ticks; const char *name; int rating; int irq; int bound_on; const struct cpumask *cpumask; struct list_head list; struct module *owner; } ____cacheline_aligned; /* Helpers to verify state of a clockevent device */ static inline bool clockevent_state_detached(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_DETACHED; } static inline bool clockevent_state_shutdown(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_SHUTDOWN; } static inline bool clockevent_state_periodic(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_PERIODIC; } static inline bool clockevent_state_oneshot(struct clock_event_device *dev) { return dev->state_use_accessors == CLOCK_EVT_STATE_ONESHOT; } static inline bool clockevent_state_oneshot_stopped(struct clock_event_device *dev) { return dev->state_use_accessors == 
CLOCK_EVT_STATE_ONESHOT_STOPPED; } /* * Calculate a multiplication factor for scaled math, which is used to convert * nanoseconds based values to clock ticks: * * clock_ticks = (nanoseconds * factor) >> shift. * * div_sc is the rearranged equation to calculate a factor from a given clock * ticks / nanoseconds ratio: * * factor = (clock_ticks << shift) / nanoseconds */ static inline unsigned long div_sc(unsigned long ticks, unsigned long nsec, int shift) { u64 tmp = ((u64)ticks) << shift; do_div(tmp, nsec); return (unsigned long) tmp; } /* Clock event layer functions */ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); extern int clockevents_unbind_device(struct clock_event_device *ced, int cpu); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); extern int clockevents_update_freq(struct clock_event_device *ce, u32 freq); static inline void clockevents_calc_mult_shift(struct clock_event_device *ce, u32 freq, u32 maxsec) { return clocks_calc_mult_shift(&ce->mult, &ce->shift, NSEC_PER_SEC, freq, maxsec); } extern void clockevents_suspend(void); extern void clockevents_resume(void); # ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST # ifdef CONFIG_ARCH_HAS_TICK_BROADCAST extern void tick_broadcast(const struct cpumask *mask); # else # define tick_broadcast NULL # endif extern int tick_receive_broadcast(void); # endif # if defined(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) && defined(CONFIG_TICK_ONESHOT) extern void tick_setup_hrtimer_broadcast(void); extern int tick_check_broadcast_expired(void); # else static inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } # endif #else /* !CONFIG_GENERIC_CLOCKEVENTS: */ static inline void clockevents_suspend(void) { } static inline void clockevents_resume(void) { } static inline int tick_check_broadcast_expired(void) { return 0; } static inline void tick_setup_hrtimer_broadcast(void) { } #endif /* !CONFIG_GENERIC_CLOCKEVENTS */ #endif /* _LINUX_CLOCKCHIPS_H */
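/*
 * Editor's illustrative sketch (not part of clockchips.h): shows the scaled
 * math that the div_sc() comment above describes, using a hypothetical 1 MHz
 * timer. Plain userspace C; do_div() is replaced by ordinary 64-bit division,
 * and example_div_sc()/the constants are invented for the demonstration.
 */
#include <stdint.h>
#include <stdio.h>

static unsigned long example_div_sc(unsigned long ticks, unsigned long nsec, int shift)
{
	uint64_t tmp = (uint64_t)ticks << shift;

	return (unsigned long)(tmp / nsec);	/* factor = (ticks << shift) / ns */
}

int main(void)
{
	/* Hypothetical device: 1,000,000 ticks per 1,000,000,000 ns (1 MHz). */
	unsigned long mult = example_div_sc(1000000, 1000000000, 32);
	uint64_t delta_ns = 1500000;			/* program an event 1.5 ms ahead */
	uint64_t ticks = (delta_ns * mult) >> 32;	/* clock_ticks = (ns * factor) >> shift */

	printf("mult=%lu, 1.5ms -> %llu ticks (expected ~1500)\n",
	       mult, (unsigned long long)ticks);
	return 0;
}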
/* SPDX-License-Identifier: GPL-2.0 */ /* * RT Mutexes: blocking mutual exclusion locks with PI support * * started by Ingo Molnar and Thomas Gleixner: * * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> * * This file contains the private data structure and API definitions. */ #ifndef __KERNEL_RTMUTEX_COMMON_H #define __KERNEL_RTMUTEX_COMMON_H #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> /* * This is the control structure for tasks blocked on a rt_mutex, * which is allocated on the kernel stack of the blocked task. * * @tree_entry: pi node to enqueue into the mutex waiters tree * @pi_tree_entry: pi node to enqueue into the mutex owner waiters tree * @task: task reference to the blocked task */ struct rt_mutex_waiter { struct rb_node tree_entry; struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; struct rt_mutex *deadlock_lock; #endif int prio; u64 deadline; }; /* * Various helpers to access the waiters-tree: */ #ifdef CONFIG_RT_MUTEXES static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return !RB_EMPTY_ROOT(&lock->waiters.rb_root); } static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { struct rb_node *leftmost = rb_first_cached(&lock->waiters); struct rt_mutex_waiter *w = NULL; if (leftmost) { w = rb_entry(leftmost, struct rt_mutex_waiter, tree_entry); BUG_ON(w->lock != lock); } return w; } static inline int task_has_pi_waiters(struct task_struct *p) { return !RB_EMPTY_ROOT(&p->pi_waiters.rb_root); } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { return rb_entry(p->pi_waiters.rb_leftmost, struct rt_mutex_waiter, pi_tree_entry); } #else static inline int rt_mutex_has_waiters(struct rt_mutex *lock) { return false; } static inline struct rt_mutex_waiter * rt_mutex_top_waiter(struct rt_mutex *lock) { return NULL; } static inline int task_has_pi_waiters(struct task_struct *p) { return false; } static inline struct rt_mutex_waiter * task_top_pi_waiter(struct task_struct *p) { return NULL; } #endif /* * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* * Constants for rt mutex functions which have a selectable deadlock * detection. * * RT_MUTEX_MIN_CHAINWALK: Stops the lock chain walk when there are * no further PI adjustments to be made. * * RT_MUTEX_FULL_CHAINWALK: Invoke deadlock detection with a full * walk of the lock chain.
*/ enum rtmutex_chainwalk { RT_MUTEX_MIN_CHAINWALK, RT_MUTEX_FULL_CHAINWALK, }; /* * PI-futex support (proxy locking functions, etc.): */ extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, struct hrtimer_sleeper *to, struct rt_mutex_waiter *waiter); extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter); extern int rt_mutex_futex_trylock(struct rt_mutex *l); extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, struct wake_q_head *wqh); extern void rt_mutex_postunlock(struct wake_q_head *wake_q); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" #else # include "rtmutex.h" #endif #endif
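/*
 * Editor's illustrative sketch (not part of rtmutex_common.h): the
 * "has waiters" flag lives in bit 0 of lock->owner, which works because
 * task_struct pointers are at least word aligned. example_task and the
 * encode/decode helper names below are made up for the demonstration;
 * decode_owner() mirrors what rt_mutex_owner() does above.
 */
#include <stdint.h>
#include <stdio.h>

#define EXAMPLE_HAS_WAITERS 1UL

struct example_task { int dummy; } __attribute__((aligned(8)));

static void *encode_owner(struct example_task *owner, int has_waiters)
{
	return (void *)((uintptr_t)owner | (has_waiters ? EXAMPLE_HAS_WAITERS : 0));
}

static struct example_task *decode_owner(void *owner_field)
{
	return (struct example_task *)((uintptr_t)owner_field & ~EXAMPLE_HAS_WAITERS);
}

int main(void)
{
	static struct example_task t;
	void *owner = encode_owner(&t, 1);

	printf("waiters flag=%lu, owner pointer recovered ok=%d\n",
	       (unsigned long)((uintptr_t)owner & EXAMPLE_HAS_WAITERS),
	       decode_owner(owner) == &t);
	return 0;
}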
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_KDEV_T_H #define _LINUX_KDEV_T_H #include <uapi/linux/kdev_t.h> #define MINORBITS 20 #define MINORMASK ((1U << MINORBITS) - 1) #define MAJOR(dev) ((unsigned int) ((dev) >> MINORBITS)) #define MINOR(dev) ((unsigned int) ((dev) & MINORMASK)) #define MKDEV(ma,mi) (((ma) << MINORBITS) | (mi)) #define print_dev_t(buffer, dev) \ sprintf((buffer), "%u:%u\n", MAJOR(dev), MINOR(dev)) #define format_dev_t(buffer, dev) \ ({ \ sprintf(buffer, "%u:%u", MAJOR(dev), MINOR(dev)); \ buffer; \ }) /* acceptable for old filesystems */ static __always_inline bool old_valid_dev(dev_t dev) { return MAJOR(dev) < 256 && MINOR(dev) < 256; } static __always_inline u16 old_encode_dev(dev_t dev) { return (MAJOR(dev) << 8) | MINOR(dev); } static __always_inline dev_t old_decode_dev(u16 val) { return MKDEV((val >> 8) & 255, val & 255); } static __always_inline u32 new_encode_dev(dev_t dev) { unsigned major = MAJOR(dev); unsigned minor = MINOR(dev); return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12); } static __always_inline dev_t new_decode_dev(u32 dev) { unsigned major = (dev & 0xfff00) >> 8; unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00); return MKDEV(major, minor); } static __always_inline u64 huge_encode_dev(dev_t dev) { return new_encode_dev(dev); } static __always_inline dev_t huge_decode_dev(u64 dev) { return new_decode_dev(dev); } static __always_inline int sysv_valid_dev(dev_t dev) { return MAJOR(dev) < (1<<14) && MINOR(dev) < (1<<18); } static __always_inline u32 sysv_encode_dev(dev_t dev) { return MINOR(dev) | (MAJOR(dev) << 18); } static __always_inline unsigned sysv_major(u32 dev) { return (dev >> 18) & 0x3fff; } static __always_inline unsigned sysv_minor(u32 dev) { return dev & 0x3ffff; } #endif
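/*
 * Editor's illustrative sketch (not part of kdev_t.h): round-trips a device
 * number through the 20-bit-minor split and the "new" 32-bit encoding shown
 * above. Userspace C; dev_t is stood in for by uint32_t, and the macros are
 * restated locally (with an EX_ prefix) so the example compiles on its own.
 */
#include <stdint.h>
#include <stdio.h>

#define EX_MINORBITS	20
#define EX_MINORMASK	((1U << EX_MINORBITS) - 1)
#define EX_MAJOR(dev)	((unsigned int)((dev) >> EX_MINORBITS))
#define EX_MINOR(dev)	((unsigned int)((dev) & EX_MINORMASK))
#define EX_MKDEV(ma, mi)	(((ma) << EX_MINORBITS) | (mi))

static uint32_t ex_new_encode_dev(uint32_t dev)
{
	unsigned major = EX_MAJOR(dev), minor = EX_MINOR(dev);

	/* low 8 minor bits, then 12 major bits, then the remaining minor bits */
	return (minor & 0xff) | (major << 8) | ((minor & ~0xff) << 12);
}

static uint32_t ex_new_decode_dev(uint32_t dev)
{
	unsigned major = (dev & 0xfff00) >> 8;
	unsigned minor = (dev & 0xff) | ((dev >> 12) & 0xfff00);

	return EX_MKDEV(major, minor);
}

int main(void)
{
	uint32_t dev = EX_MKDEV(259, 70000);	/* a large minor exercises the split */

	printf("%u:%u encoded=%#x roundtrip ok=%d\n",
	       EX_MAJOR(dev), EX_MINOR(dev), ex_new_encode_dev(dev),
	       ex_new_decode_dev(ex_new_encode_dev(dev)) == dev);
	return 0;
}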
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PIPE_FS_I_H #define _LINUX_PIPE_FS_I_H #define PIPE_DEF_BUFFERS 16 #define PIPE_BUF_FLAG_LRU 0x01 /* page is on the LRU */ #define PIPE_BUF_FLAG_ATOMIC 0x02 /* was atomically mapped */ #define PIPE_BUF_FLAG_GIFT 0x04 /* page is a gift */ #define PIPE_BUF_FLAG_PACKET 0x08 /* read() as a packet */ #define PIPE_BUF_FLAG_CAN_MERGE 0x10 /* can merge buffers */ #define PIPE_BUF_FLAG_WHOLE 0x20 /* read() must return entire buffer or error */ #ifdef CONFIG_WATCH_QUEUE #define PIPE_BUF_FLAG_LOSS 0x40 /* Message loss happened after this buffer */ #endif /** * struct pipe_buffer - a linux kernel pipe buffer * @page: the page containing the data for the pipe buffer * @offset: offset of data inside the @page * @len: length of data inside the @page * @ops: operations associated with this buffer. See @pipe_buf_operations. * @flags: pipe buffer flags. See above. * @private: private data owned by the ops. **/ struct pipe_buffer { struct page *page; unsigned int offset, len; const struct pipe_buf_operations *ops; unsigned int flags; unsigned long private; }; /** * struct pipe_inode_info - a linux kernel pipe * @mutex: mutex protecting the whole thing * @rd_wait: reader wait point in case of empty pipe * @wr_wait: writer wait point in case of full pipe * @head: The point of buffer production * @tail: The point of buffer consumption * @note_loss: The next read() should insert a data-lost message * @max_usage: The maximum number of slots that may be used in the ring * @ring_size: total number of buffers (should be a power of 2) * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs * @tmp_page: cached released page * @readers: number of current readers of this pipe * @writers: number of current writers of this pipe * @files: number of struct file referring to this pipe (protected by ->i_lock) * @r_counter: reader counter * @w_counter: writer counter * @poll_usage: is this pipe used for epoll, which has crazy wakeups?
* @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers * @user: the user who created this pipe * @watch_queue: If this pipe is a watch_queue, this is the stuff for that **/ struct pipe_inode_info { struct mutex mutex; wait_queue_head_t rd_wait, wr_wait; unsigned int head; unsigned int tail; unsigned int max_usage; unsigned int ring_size; #ifdef CONFIG_WATCH_QUEUE bool note_loss; #endif unsigned int nr_accounted; unsigned int readers; unsigned int writers; unsigned int files; unsigned int r_counter; unsigned int w_counter; unsigned int poll_usage; struct page *tmp_page; struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; struct user_struct *user; #ifdef CONFIG_WATCH_QUEUE struct watch_queue *watch_queue; #endif }; /* * Note on the nesting of these functions: * * ->confirm() * ->try_steal() * * That is, ->try_steal() must be called on a confirmed buffer. See below for * the meaning of each operation. Also see the kerneldoc in fs/pipe.c for the * pipe and generic variants of these hooks. */ struct pipe_buf_operations { /* * ->confirm() verifies that the data in the pipe buffer is there * and that the contents are good. If the pages in the pipe belong * to a file system, we may need to wait for IO completion in this * hook. Returns 0 for good, or a negative error value in case of * error. If not present, all pages are considered good. */ int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *); /* * When the contents of this pipe buffer have been completely * consumed by a reader, ->release() is called. */ void (*release)(struct pipe_inode_info *, struct pipe_buffer *); /* * Attempt to take ownership of the pipe buffer and its contents. * ->try_steal() returns %true for success, in which case the contents * of the pipe (the buf->page) are locked and now completely owned by the * caller. The page may then be transferred to a different mapping; the * most common case is insertion into a different file's address space * cache. */ bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *); /* * Get a reference to the pipe buffer. */ bool (*get)(struct pipe_inode_info *, struct pipe_buffer *); }; /** * pipe_empty - Return true if the pipe is empty * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ static inline bool pipe_empty(unsigned int head, unsigned int tail) { return head == tail; } /** * pipe_occupancy - Return number of slots used in the pipe * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer */ static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail) { return head - tail; } /** * pipe_full - Return true if the pipe is full * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer * @limit: The maximum number of slots available.
*/ static inline bool pipe_full(unsigned int head, unsigned int tail, unsigned int limit) { return pipe_occupancy(head, tail) >= limit; } /** * pipe_space_for_user - Return number of slots available to userspace * @head: The pipe ring head pointer * @tail: The pipe ring tail pointer * @pipe: The pipe info structure */ static inline unsigned int pipe_space_for_user(unsigned int head, unsigned int tail, struct pipe_inode_info *pipe) { unsigned int p_occupancy, p_space; p_occupancy = pipe_occupancy(head, tail); if (p_occupancy >= pipe->max_usage) return 0; p_space = pipe->ring_size - p_occupancy; if (p_space > pipe->max_usage) p_space = pipe->max_usage; return p_space; } /** * pipe_buf_get - get a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to get a reference to * * Return: %true if the reference was successfully obtained. */ static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { return buf->ops->get(pipe, buf); } /** * pipe_buf_release - put a reference to a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to put a reference to */ static inline void pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { const struct pipe_buf_operations *ops = buf->ops; buf->ops = NULL; ops->release(pipe, buf); } /** * pipe_buf_confirm - verify contents of the pipe buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to confirm */ static inline int pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!buf->ops->confirm) return 0; return buf->ops->confirm(pipe, buf); } /** * pipe_buf_try_steal - attempt to take ownership of a pipe_buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to attempt to steal */ static inline bool pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { if (!buf->ops->try_steal) return false; return buf->ops->try_steal(pipe, buf); } /* Differs from PIPE_BUF in that PIPE_SIZE is the length of the actual memory allocation, whereas PIPE_BUF makes atomicity guarantees. 
*/ #define PIPE_SIZE PAGE_SIZE /* Pipe lock and unlock operations */ void pipe_lock(struct pipe_inode_info *); void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); extern unsigned int pipe_max_size; extern unsigned long pipe_user_pages_hard; extern unsigned long pipe_user_pages_soft; /* Wait for a pipe to be readable/writable while dropping the pipe lock */ void pipe_wait_readable(struct pipe_inode_info *); void pipe_wait_writable(struct pipe_inode_info *); struct pipe_inode_info *alloc_pipe_info(void); void free_pipe_info(struct pipe_inode_info *); /* Generic pipe buffer ops functions */ bool generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); bool generic_pipe_buf_try_steal(struct pipe_inode_info *, struct pipe_buffer *); void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); extern const struct pipe_buf_operations nosteal_pipe_buf_ops; #ifdef CONFIG_WATCH_QUEUE unsigned long account_pipe_buffers(struct user_struct *user, unsigned long old, unsigned long new); bool too_many_pipe_buffers_soft(unsigned long user_bufs); bool too_many_pipe_buffers_hard(unsigned long user_bufs); bool pipe_is_unprivileged_user(void); #endif /* for F_SETPIPE_SZ and F_GETPIPE_SZ */ #ifdef CONFIG_WATCH_QUEUE int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots); #endif long pipe_fcntl(struct file *, unsigned int, unsigned long arg); struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice); int create_pipe_files(struct file **, int); unsigned int round_pipe_size(unsigned long size); #endif
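/*
 * Editor's illustrative sketch (not part of pipe_fs_i.h): the head/tail
 * fields above are free-running unsigned counters, so occupancy is a plain
 * subtraction that stays correct across wraparound, and a slot index is
 * obtained by masking with the power-of-two ring size. The values and the
 * ex_ helper name are invented for the demonstration.
 */
#include <stdio.h>

static unsigned int ex_pipe_occupancy(unsigned int head, unsigned int tail)
{
	return head - tail;	/* defined modulo 2^32, survives wraparound */
}

int main(void)
{
	unsigned int ring_size = 16;		/* like PIPE_DEF_BUFFERS */
	unsigned int tail = 0xfffffffe;		/* counters about to wrap */
	unsigned int head = tail + 3;		/* three buffers queued */

	printf("occupancy=%u full=%d slot-of-head=%u\n",
	       ex_pipe_occupancy(head, tail),
	       ex_pipe_occupancy(head, tail) >= ring_size,
	       head & (ring_size - 1));
	return 0;
}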
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef IOPRIO_H #define IOPRIO_H #include <linux/sched.h> #include <linux/sched/rt.h> #include <linux/iocontext.h> /* * Gives us 8 prio classes with 13-bits of data for each class */ #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) #define IOPRIO_PRIO_CLASS(mask) ((mask) >> IOPRIO_CLASS_SHIFT) #define IOPRIO_PRIO_DATA(mask) ((mask) & IOPRIO_PRIO_MASK) #define IOPRIO_PRIO_VALUE(class, data) (((class) << IOPRIO_CLASS_SHIFT) | data) #define ioprio_valid(mask) (IOPRIO_PRIO_CLASS((mask)) != IOPRIO_CLASS_NONE) /* * These are the io priority groups as implemented by CFQ. RT is the realtime * class, it always gets premium service. BE is the best-effort scheduling * class, the default for any process. IDLE is the idle scheduling class, it * is only served when no one else is using the disk. */ enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE, }; /* * 8 best effort priority levels are supported */ #define IOPRIO_BE_NR (8) enum { IOPRIO_WHO_PROCESS = 1, IOPRIO_WHO_PGRP, IOPRIO_WHO_USER, }; /* * Fallback BE priority */ #define IOPRIO_NORM (4) /* * if process has set io priority explicitly, use that. if not, convert * the cpu scheduler nice value to an io priority */ static inline int task_nice_ioprio(struct task_struct *task) { return (task_nice(task) + 20) / 5; } /* * This is for the case where the task hasn't asked for a specific IO class. * Check for idle and rt task process, and return appropriate IO class. */ static inline int task_nice_ioclass(struct task_struct *task) { if (task->policy == SCHED_IDLE) return IOPRIO_CLASS_IDLE; else if (task_is_realtime(task)) return IOPRIO_CLASS_RT; else return IOPRIO_CLASS_BE; } /* * If the calling process has set an I/O priority, use that. Otherwise, return * the default I/O priority. */ static inline int get_current_ioprio(void) { struct io_context *ioc = current->io_context; if (ioc) return ioc->ioprio; return IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); } /* * For inheritance, return the highest of the two given priorities */ extern int ioprio_best(unsigned short aprio, unsigned short bprio); extern int set_task_ioprio(struct task_struct *task, int ioprio); #ifdef CONFIG_BLOCK extern int ioprio_check_cap(int ioprio); #else static inline int ioprio_check_cap(int ioprio) { return -ENOTBLK; } #endif /* CONFIG_BLOCK */ #endif
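/*
 * Editor's illustrative sketch (not part of ioprio.h): shows how a class and
 * per-class data are packed into one ioprio value, and how the nice-to-ioprio
 * formula above maps the -20..19 nice range onto best-effort levels 0..7.
 * The EX_ constants and ex_nice_to_ioprio() are local stand-ins restated so
 * this compiles as plain userspace C.
 */
#include <stdio.h>

#define EX_IOPRIO_CLASS_SHIFT	13
#define EX_IOPRIO_PRIO_VALUE(class, data) (((class) << EX_IOPRIO_CLASS_SHIFT) | (data))
#define EX_IOPRIO_CLASS_BE	2

static int ex_nice_to_ioprio(int nice)
{
	return (nice + 20) / 5;	/* same formula as task_nice_ioprio() */
}

int main(void)
{
	int nice;

	for (nice = -20; nice <= 19; nice += 13)
		printf("nice %3d -> BE level %d (ioprio value %#x)\n",
		       nice, ex_nice_to_ioprio(nice),
		       (unsigned)EX_IOPRIO_PRIO_VALUE(EX_IOPRIO_CLASS_BE,
						      ex_nice_to_ioprio(nice)));
	return 0;
}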
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_CHECKSUM_64_H #define _ASM_X86_CHECKSUM_64_H /* * Checksums for x86-64 * Copyright 2002 by Andi Kleen, SuSE Labs * with some code from asm-x86/checksum.h */ #include <linux/compiler.h> #include <linux/uaccess.h> #include <asm/byteorder.h> /** * csum_fold - Fold and invert a 32bit checksum. * sum: 32bit unfolded sum * * Fold a 32bit running checksum to 16bit and invert it. This is usually * the last step before putting a checksum into a packet. * Make sure not to mix with 64bit checksums. */ static inline __sum16 csum_fold(__wsum sum) { asm(" addl %1,%0\n" " adcl $0xffff,%0" : "=r" (sum) : "r" ((__force u32)sum << 16), "0" ((__force u32)sum & 0xffff0000)); return (__force __sum16)(~(__force u32)sum >> 16); } /* * This is a version of ip_compute_csum() optimized for IP headers, * which always checksum on 4 octet boundaries. * * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by * Arnt Gulbrandsen. */ /** * ip_fast_csum - Compute the IPv4 header checksum efficiently. * iph: ipv4 header * ihl: length of header / 4 */ static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) { unsigned int sum; asm(" movl (%1), %0\n" " subl $4, %2\n" " jbe 2f\n" " addl 4(%1), %0\n" " adcl 8(%1), %0\n" " adcl 12(%1), %0\n" "1: adcl 16(%1), %0\n" " lea 4(%1), %1\n" " decl %2\n" " jne 1b\n" " adcl $0, %0\n" " movl %0, %2\n" " shrl $16, %0\n" " addw %w2, %w0\n" " adcl $0, %0\n" " notl %0\n" "2:" /* Since the input registers which are loaded with iph and ihl are modified, we must also specify them as outputs, or gcc will assume they contain their original values. */ : "=r" (sum), "=r" (iph), "=r" (ihl) : "1" (iph), "2" (ihl) : "memory"); return (__force __sum16)sum; } /** * csum_tcpudp_nofold - Compute an IPv4 pseudo header checksum. * @saddr: source address * @daddr: destination address * @len: length of packet * @proto: ip protocol of packet * @sum: initial sum to be added in (32bit unfolded) * * Returns the pseudo header checksum of the input data. Result is * 32bit unfolded. */ static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { asm(" addl %1, %0\n" " adcl %2, %0\n" " adcl %3, %0\n" " adcl $0, %0\n" : "=r" (sum) : "g" (daddr), "g" (saddr), "g" ((len + proto)<<8), "0" (sum)); return sum; } /** * csum_tcpudp_magic - Compute an IPv4 pseudo header checksum. * @saddr: source address * @daddr: destination address * @len: length of packet * @proto: ip protocol of packet * @sum: initial sum to be added in (32bit unfolded) * * Returns the 16bit pseudo header checksum of the input data, already * complemented and ready to be filled in. */ static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, __u32 len, __u8 proto, __wsum sum) { return csum_fold(csum_tcpudp_nofold(saddr, daddr, len, proto, sum)); } /** * csum_partial - Compute an internet checksum.
* @buff: buffer to be checksummed * @len: length of buffer. * @sum: initial sum to be added in (32bit unfolded) * * Returns the 32bit unfolded internet checksum of the buffer. * Before filling it in it needs to be csum_fold()'ed. * buff should be aligned to a 64bit boundary if possible. */ extern __wsum csum_partial(const void *buff, int len, __wsum sum); /* Do not call this directly. Use the wrappers below */ extern __visible __wsum csum_partial_copy_generic(const void *src, void *dst, int len); extern __wsum csum_and_copy_from_user(const void __user *src, void *dst, int len); extern __wsum csum_and_copy_to_user(const void *src, void __user *dst, int len); extern __wsum csum_partial_copy_nocheck(const void *src, void *dst, int len); /** * ip_compute_csum - Compute a 16bit IP checksum. * @buff: buffer address. * @len: length of buffer. * * Returns the 16bit folded/inverted checksum of the passed buffer. * Ready to fill in. */ extern __sum16 ip_compute_csum(const void *buff, int len); /** * csum_ipv6_magic - Compute checksum of an IPv6 pseudo header. * @saddr: source address * @daddr: destination address * @len: length of packet * @proto: protocol of packet * @sum: initial sum (32bit unfolded) to be added in * * Computes an IPv6 pseudo header checksum. This sum is added into the * checksum of UDP/TCP packets and contains some link layer information. * Returns the unfolded 32bit checksum. */ struct in6_addr; #define _HAVE_ARCH_IPV6_CSUM 1 extern __sum16 csum_ipv6_magic(const struct in6_addr *saddr, const struct in6_addr *daddr, __u32 len, __u8 proto, __wsum sum); static inline unsigned add32_with_carry(unsigned a, unsigned b) { asm("addl %2,%0\n\t" "adcl $0,%0" : "=r" (a) : "0" (a), "rm" (b)); return a; } #define HAVE_ARCH_CSUM_ADD static inline __wsum csum_add(__wsum csum, __wsum addend) { return (__force __wsum)add32_with_carry((__force unsigned)csum, (__force unsigned)addend); } #endif /* _ASM_X86_CHECKSUM_64_H */
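/*
 * Editor's illustrative sketch (not part of checksum_64.h): a portable
 * equivalent of csum_partial() + csum_fold() - sum the buffer as 16-bit
 * words, fold the carries back in, and complement. The asm versions above do
 * the same work with adc chains. ex_ip_checksum() and the test header are
 * ours; the header bytes are a commonly used worked example whose correct
 * checksum is 0xb861.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint16_t ex_ip_checksum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint32_t sum = 0;
	size_t i;

	for (i = 0; i + 1 < len; i += 2)
		sum += (uint32_t)p[i] << 8 | p[i + 1];
	if (len & 1)				/* odd trailing byte */
		sum += (uint32_t)p[len - 1] << 8;
	while (sum >> 16)			/* end-around carry (the "fold") */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	/* Example IPv4 header; the checksum field (bytes 10-11) is zeroed
	 * before summing, as a sender would do. */
	uint8_t hdr[20] = { 0x45, 0x00, 0x00, 0x73, 0x00, 0x00, 0x40, 0x00,
			    0x40, 0x11, 0x00, 0x00, 0xc0, 0xa8, 0x00, 0x01,
			    0xc0, 0xa8, 0x00, 0xc7 };

	printf("checksum = 0x%04x\n", (unsigned)ex_ip_checksum(hdr, sizeof(hdr)));
	return 0;
}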
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_COMPAT_H #define _ASM_X86_COMPAT_H /* * Architecture specific compatibility types */ #include <linux/types.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <asm/processor.h> #include <asm/user32.h> #include <asm/unistd.h> #include <asm-generic/compat.h> #define COMPAT_USER_HZ 100 #define COMPAT_UTS_MACHINE "i686\0\0" typedef u16 __compat_uid_t; typedef u16 __compat_gid_t; typedef u32 __compat_uid32_t; typedef u32 __compat_gid32_t; typedef u16 compat_mode_t; typedef u16 compat_dev_t; typedef u16 compat_nlink_t; typedef u16 compat_ipc_pid_t; typedef u32 compat_caddr_t; typedef __kernel_fsid_t compat_fsid_t; struct compat_stat { compat_dev_t st_dev; u16 __pad1; compat_ino_t st_ino; compat_mode_t st_mode; compat_nlink_t st_nlink; __compat_uid_t st_uid; __compat_gid_t st_gid; compat_dev_t st_rdev; u16 __pad2; u32 st_size; u32 st_blksize; u32 st_blocks; u32 st_atime; u32 st_atime_nsec; u32 st_mtime; u32 st_mtime_nsec; u32 st_ctime; u32 st_ctime_nsec; u32 __unused4; u32 __unused5; }; struct compat_flock { short l_type; short l_whence; compat_off_t l_start; compat_off_t l_len; compat_pid_t l_pid; }; #define F_GETLK64 12 /* using 'struct flock64' */ #define F_SETLK64 13 #define F_SETLKW64 14 /* * IA32 uses 4 byte alignment for 64 bit quantities, * so we need to pack this structure. */ struct compat_flock64 { short l_type; short l_whence; compat_loff_t l_start; compat_loff_t l_len; compat_pid_t l_pid; } __attribute__((packed)); struct compat_statfs { int f_type; int f_bsize; int f_blocks; int f_bfree; int f_bavail; int f_files; int f_ffree; compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field.
*/ int f_frsize; int f_flags; int f_spare[4]; }; #define COMPAT_RLIM_INFINITY 0xffffffff typedef u32 compat_old_sigset_t; /* at least 32 bits */ #define _COMPAT_NSIG 64 #define _COMPAT_NSIG_BPW 32 typedef u32 compat_sigset_word; #define COMPAT_OFF_T_MAX 0x7fffffff struct compat_ipc64_perm { compat_key_t key; __compat_uid32_t uid; __compat_gid32_t gid; __compat_uid32_t cuid; __compat_gid32_t cgid; unsigned short mode; unsigned short __pad1; unsigned short seq; unsigned short __pad2; compat_ulong_t unused1; compat_ulong_t unused2; }; struct compat_semid64_ds { struct compat_ipc64_perm sem_perm; compat_ulong_t sem_otime; compat_ulong_t sem_otime_high; compat_ulong_t sem_ctime; compat_ulong_t sem_ctime_high; compat_ulong_t sem_nsems; compat_ulong_t __unused3; compat_ulong_t __unused4; }; struct compat_msqid64_ds { struct compat_ipc64_perm msg_perm; compat_ulong_t msg_stime; compat_ulong_t msg_stime_high; compat_ulong_t msg_rtime; compat_ulong_t msg_rtime_high; compat_ulong_t msg_ctime; compat_ulong_t msg_ctime_high; compat_ulong_t msg_cbytes; compat_ulong_t msg_qnum; compat_ulong_t msg_qbytes; compat_pid_t msg_lspid; compat_pid_t msg_lrpid; compat_ulong_t __unused4; compat_ulong_t __unused5; }; struct compat_shmid64_ds { struct compat_ipc64_perm shm_perm; compat_size_t shm_segsz; compat_ulong_t shm_atime; compat_ulong_t shm_atime_high; compat_ulong_t shm_dtime; compat_ulong_t shm_dtime_high; compat_ulong_t shm_ctime; compat_ulong_t shm_ctime_high; compat_pid_t shm_cpid; compat_pid_t shm_lpid; compat_ulong_t shm_nattch; compat_ulong_t __unused4; compat_ulong_t __unused5; }; /* * The type of struct elf_prstatus.pr_reg in compatible core dumps. */ typedef struct user_regs_struct compat_elf_gregset_t; /* Full regset -- prstatus on x32, otherwise on ia32 */ #define PRSTATUS_SIZE(S, R) (R != sizeof(S.pr_reg) ? 144 : 296) #define SET_PR_FPVALID(S, V, R) \ do { *(int *) (((void *) &((S)->pr_reg)) + R) = (V); } \ while (0) #ifdef CONFIG_X86_X32_ABI #define COMPAT_USE_64BIT_TIME \ (!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT)) #endif static inline void __user *arch_compat_alloc_user_space(long len) { compat_uptr_t sp; if (test_thread_flag(TIF_IA32)) { sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ sp = task_pt_regs(current)->sp - 128; } return (void __user *)round_down(sp - len, 16); } static inline bool in_x32_syscall(void) { #ifdef CONFIG_X86_X32_ABI if (task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT) return true; #endif return false; } static inline bool in_32bit_syscall(void) { return in_ia32_syscall() || in_x32_syscall(); } #ifdef CONFIG_COMPAT static inline bool in_compat_syscall(void) { return in_32bit_syscall(); } #define in_compat_syscall in_compat_syscall /* override the generic impl */ #define compat_need_64bit_alignment_fixup in_ia32_syscall #endif struct compat_siginfo; #ifdef CONFIG_X86_X32_ABI int copy_siginfo_to_user32(struct compat_siginfo __user *to, const kernel_siginfo_t *from); #define copy_siginfo_to_user32 copy_siginfo_to_user32 #endif /* CONFIG_X86_X32_ABI */ #endif /* _ASM_X86_COMPAT_H */
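/*
 * Editor's illustrative sketch (not part of compat.h): demonstrates why
 * compat_flock64 above is declared __attribute__((packed)) - i386 aligns
 * 64-bit quantities to 4 bytes, so the natural x86-64 layout would insert
 * padding and shift every field. The ex_ struct names are local stand-ins,
 * with long long playing the role of compat_loff_t.
 */
#include <stddef.h>
#include <stdio.h>

struct ex_flock64_natural {		/* what x86-64 would produce by default */
	short l_type;
	short l_whence;
	long long l_start;
	long long l_len;
	int l_pid;
};

struct ex_flock64_packed {		/* mirrors the i386 layout */
	short l_type;
	short l_whence;
	long long l_start;
	long long l_len;
	int l_pid;
} __attribute__((packed));

int main(void)
{
	printf("natural: l_start at %zu, size %zu\n",
	       offsetof(struct ex_flock64_natural, l_start),
	       sizeof(struct ex_flock64_natural));
	printf("packed:  l_start at %zu, size %zu (matches 32-bit userspace)\n",
	       offsetof(struct ex_flock64_packed, l_start),
	       sizeof(struct ex_flock64_packed));
	return 0;
}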
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_TASK_H #define _LINUX_SCHED_TASK_H /* * Interface between the scheduler and various task lifetime (fork()/exit()) * functionality: */ #include <linux/sched.h> #include <linux/uaccess.h> struct task_struct; struct rusage; union thread_union; struct css_set; /* All the bits taken by the old clone syscall. */ #define CLONE_LEGACY_FLAGS 0xffffffffULL struct kernel_clone_args { u64 flags; int __user *pidfd; int __user *child_tid; int __user *parent_tid; int exit_signal; unsigned long stack; unsigned long stack_size; unsigned long tls; pid_t *set_tid; /* Number of elements in *set_tid */ size_t set_tid_size; int cgroup; struct cgroup *cgrp; struct css_set *cset; }; /* * This serializes "schedule()" and also protects * the run-queue from deletions/modifications (but * _adding_ to the beginning of the run-queue has * a separate lock). */ extern rwlock_t tasklist_lock; extern spinlock_t mmlist_lock; extern union thread_union init_thread_union; extern struct task_struct init_task; #ifdef CONFIG_PROVE_RCU extern int lockdep_tasklist_lock_is_held(void); #endif /* #ifdef CONFIG_PROVE_RCU */ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); extern void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); extern void proc_caches_init(void); extern void fork_init(void); extern void release_task(struct task_struct * p); extern int copy_thread(unsigned long, unsigned long, unsigned long, struct task_struct *, unsigned long); extern void flush_thread(void); #ifdef CONFIG_HAVE_EXIT_THREAD extern void exit_thread(struct task_struct *tsk); #else static inline void exit_thread(struct task_struct *tsk) { } #endif extern void do_group_exit(int); extern void exit_files(struct task_struct *); extern void exit_itimers(struct signal_struct *); extern pid_t kernel_clone(struct kernel_clone_args *kargs); struct task_struct *fork_idle(int); struct mm_struct *copy_init_mm(void); extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags); extern long kernel_wait4(pid_t, int __user *, int, struct rusage *); int kernel_wait(pid_t pid, int *stat); extern void free_task(struct task_struct *tsk); /* sched_exec is called by processes performing an exec */ #ifdef CONFIG_SMP extern void sched_exec(void); #else #define sched_exec() {} #endif static inline struct task_struct *get_task_struct(struct task_struct *t) { refcount_inc(&t->usage); return t; } extern void __put_task_struct(struct task_struct *t); static inline void put_task_struct(struct task_struct *t) { if (refcount_dec_and_test(&t->usage)) __put_task_struct(t); } static inline void put_task_struct_many(struct task_struct *t, int nr) { if
(refcount_sub_and_test(nr, &t->usage)) __put_task_struct(t); } void put_task_struct_rcu_user(struct task_struct *task); #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT extern int arch_task_struct_size __read_mostly; #else # define arch_task_struct_size (sizeof(struct task_struct)) #endif #ifndef CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST /* * If an architecture has not declared a thread_struct whitelist we * must assume something there may need to be copied to userspace. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = 0; /* Handle dynamically sized thread_struct. */ *size = arch_task_struct_size - offsetof(struct task_struct, thread); } #endif #ifdef CONFIG_VMAP_STACK static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return t->stack_vm_area; } #else static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) { return NULL; } #endif /* * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring * subscriptions and synchronises with wait4(). Also used in procfs. Also * pins the final release of task.io_context. Also protects ->cpuset and * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. * * Nests both inside and outside of read_lock(&tasklist_lock). * It must not be nested with write_lock_irq(&tasklist_lock), * neither inside nor outside. */ static inline void task_lock(struct task_struct *p) { spin_lock(&p->alloc_lock); } static inline void task_unlock(struct task_struct *p) { spin_unlock(&p->alloc_lock); } #endif /* _LINUX_SCHED_TASK_H */
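/*
 * Editor's illustrative sketch (not part of sched/task.h): models the
 * get_task_struct()/put_task_struct()/put_task_struct_many() pattern above
 * with a plain C11 atomic counter standing in for refcount_t; the final
 * free step is just a printf here. ex_task and the helpers are invented.
 */
#include <stdatomic.h>
#include <stdio.h>

struct ex_task { atomic_int usage; };

static void ex_get(struct ex_task *t)
{
	atomic_fetch_add(&t->usage, 1);
}

static void ex_put_many(struct ex_task *t, int nr)
{
	/* drop nr references; whoever drops the last one does the free */
	if (atomic_fetch_sub(&t->usage, nr) == nr)
		printf("last reference dropped -> __put_task_struct()\n");
}

int main(void)
{
	struct ex_task t = { .usage = 1 };	/* the task's own reference */

	ex_get(&t);				/* e.g. a waiter pins the task */
	ex_put_many(&t, 1);			/* waiter is done */
	ex_put_many(&t, 1);			/* task itself goes away */
	return 0;
}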
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_SPECIAL_INSNS_H #define _ASM_X86_SPECIAL_INSNS_H #ifdef __KERNEL__ #include <asm/nops.h> #include <asm/processor-flags.h> #include <linux/irqflags.h> #include <linux/jump_label.h> /* * The compiler should not reorder volatile asm statements with respect to each * other: they should execute in program order. However GCC 4.9.x and 5.x have * a bug (which was fixed in 8.1, 7.3 and 6.5) where they might reorder * volatile asm. The write functions are not affected since they have memory * clobbers preventing reordering. To prevent reads from being reordered with * respect to writes, use a dummy memory operand. */ #define __FORCE_ORDER "m"(*(unsigned int *)0x1000UL) void native_write_cr0(unsigned long val); static inline unsigned long native_read_cr0(void) { unsigned long val; asm volatile("mov %%cr0,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline unsigned long native_read_cr2(void) { unsigned long val; asm volatile("mov %%cr2,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static __always_inline void native_write_cr2(unsigned long val) { asm volatile("mov %0,%%cr2": : "r" (val) : "memory"); } static inline unsigned long __native_read_cr3(void) { unsigned long val; asm volatile("mov %%cr3,%0\n\t" : "=r" (val) : __FORCE_ORDER); return val; } static inline void native_write_cr3(unsigned long val) { asm volatile("mov %0,%%cr3": : "r" (val) : "memory"); } static inline unsigned long native_read_cr4(void) { unsigned long val; #ifdef CONFIG_X86_32 /* * This could fault if CR4 does not exist. Non-existent CR4 * is functionally equivalent to CR4 == 0. Keep it simple and pretend * that CR4 == 0 on CPUs that don't have CR4. */ asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) : "=r" (val) : "0" (0), __FORCE_ORDER); #else /* CR4 always exists on x86_64. */ asm volatile("mov %%cr4,%0\n\t" : "=r" (val) : __FORCE_ORDER); #endif return val; } void native_write_cr4(unsigned long val); #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS static inline u32 rdpkru(void) { u32 ecx = 0; u32 edx, pkru; /* * "rdpkru" instruction. Places PKRU contents into EAX, * clears EDX and requires that ecx=0. */ asm volatile(".byte 0x0f,0x01,0xee\n\t" : "=a" (pkru), "=d" (edx) : "c" (ecx)); return pkru; } static inline void wrpkru(u32 pkru) { u32 ecx = 0, edx = 0; /* * "wrpkru" instruction.
Loads contents in EAX to PKRU, * requires that ecx = edx = 0. */ asm volatile(".byte 0x0f,0x01,0xef\n\t" : : "a" (pkru), "c"(ecx), "d"(edx)); } static inline void __write_pkru(u32 pkru) { /* * WRPKRU is relatively expensive compared to RDPKRU. * Avoid WRPKRU when it would not change the value. */ if (pkru == rdpkru()) return; wrpkru(pkru); } #else static inline u32 rdpkru(void) { return 0; } static inline void __write_pkru(u32 pkru) { } #endif static inline void native_wbinvd(void) { asm volatile("wbinvd": : :"memory"); } extern asmlinkage void asm_load_gs_index(unsigned int selector); static inline void native_load_gs_index(unsigned int selector) { unsigned long flags; local_irq_save(flags); asm_load_gs_index(selector); local_irq_restore(flags); } static inline unsigned long __read_cr4(void) { return native_read_cr4(); } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else static inline unsigned long read_cr0(void) { return native_read_cr0(); } static inline void write_cr0(unsigned long x) { native_write_cr0(x); } static __always_inline unsigned long read_cr2(void) { return native_read_cr2(); } static __always_inline void write_cr2(unsigned long x) { native_write_cr2(x); } /* * Careful! CR3 contains more than just an address. You probably want * read_cr3_pa() instead. */ static inline unsigned long __read_cr3(void) { return __native_read_cr3(); } static inline void write_cr3(unsigned long x) { native_write_cr3(x); } static inline void __write_cr4(unsigned long x) { native_write_cr4(x); } static inline void wbinvd(void) { native_wbinvd(); } #ifdef CONFIG_X86_64 static inline void load_gs_index(unsigned int selector) { native_load_gs_index(selector); } #endif #endif /* CONFIG_PARAVIRT_XXL */ static inline void clflush(volatile void *__p) { asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); } static inline void clflushopt(volatile void *__p) { alternative_io(".byte " __stringify(NOP_DS_PREFIX) "; clflush %P0", ".byte 0x66; clflush %P0", X86_FEATURE_CLFLUSHOPT, "+m" (*(volatile char __force *)__p)); } static inline void clwb(volatile void *__p) { volatile struct { char x[64]; } *p = __p; asm volatile(ALTERNATIVE_2( ".byte " __stringify(NOP_DS_PREFIX) "; clflush (%[pax])", ".byte 0x66; clflush (%[pax])", /* clflushopt (%%rax) */ X86_FEATURE_CLFLUSHOPT, ".byte 0x66, 0x0f, 0xae, 0x30", /* clwb (%%rax) */ X86_FEATURE_CLWB) : [p] "+m" (*p) : [pax] "a" (p)); } #define nop() asm volatile ("nop") static inline void serialize(void) { /* Instruction opcode for SERIALIZE; supported in binutils >= 2.35. */ asm volatile(".byte 0xf, 0x1, 0xe8" ::: "memory"); } /* The dst parameter must be 64-bytes aligned */ static inline void movdir64b(void *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } *__dst = dst; /* * MOVDIR64B %(rdx), rax. * * Both __src and __dst must be memory constraints in order to tell the * compiler that no other memory accesses should be reordered around * this one. * * Also, both must be supplied as lvalues because this tells * the compiler what the object is (its size) the instruction accesses. * I.e., not the pointers but what they point to, thus the deref'ing '*'. 
*/ asm volatile(".byte 0x66, 0x0f, 0x38, 0xf8, 0x02" : "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); } /** * enqcmds - Enqueue a command in supervisor (CPL0) mode * @dst: destination, in MMIO space (must be 512-bit aligned) * @src: 512 bits memory operand * * The ENQCMDS instruction allows software to write a 512-bit command to * a 512-bit-aligned special MMIO region that supports the instruction. * A return status is loaded into the ZF flag in the RFLAGS register. * ZF = 0 equates to success, and ZF = 1 indicates retry or error. * * This function issues the ENQCMDS instruction to submit data from * kernel space to MMIO space, in a unit of 512 bits. Order of data access * is not guaranteed, nor is a memory barrier performed afterwards. It * returns 0 on success and -EAGAIN on failure. * * Warning: Do not use this helper unless your driver has checked that the * ENQCMDS instruction is supported on the platform and the device accepts * ENQCMDS. */ static inline int enqcmds(void __iomem *dst, const void *src) { const struct { char _[64]; } *__src = src; struct { char _[64]; } __iomem *__dst = dst; bool zf; /* * ENQCMDS %(rdx), rax * * See movdir64b()'s comment on operand specification. */ asm volatile(".byte 0xf3, 0x0f, 0x38, 0xf8, 0x02, 0x66, 0x90" CC_SET(z) : CC_OUT(z) (zf), "+m" (*__dst) : "m" (*__src), "a" (__dst), "d" (__src)); /* Submission failure is indicated via EFLAGS.ZF=1 */ if (zf) return -EAGAIN; return 0; } #endif /* __KERNEL__ */ #endif /* _ASM_X86_SPECIAL_INSNS_H */
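/*
 * Editor's illustrative sketch (not part of special_insns.h): enqcmds()
 * above reports its status through the ZF flag using the CC_SET()/CC_OUT()
 * flag-output pattern. ENQCMDS itself needs ring-0 and a suitable device, so
 * this stand-alone example demonstrates the same flag-output technique with
 * an ordinary SUB instruction instead (x86-64, GCC 6+/Clang). The function
 * and values are invented for the demonstration.
 */
#include <stdbool.h>
#include <stdio.h>

static int ex_flag_returning_op(unsigned long a, unsigned long b)
{
	bool zf;

	/* ZF is set when the subtraction result is zero; the "=@ccz"
	 * constraint hands the flag straight back to C, the way CC_OUT(z)
	 * does for enqcmds(). */
	asm volatile("sub %2, %1"
		     : "=@ccz" (zf), "+r" (a)
		     : "r" (b));

	return zf ? 0 : -1;	/* turn the flag into a return code */
}

int main(void)
{
	printf("equal:     %d\n", ex_flag_returning_op(5, 5));	/* 0  */
	printf("not equal: %d\n", ex_flag_returning_op(5, 3));	/* -1 */
	return 0;
}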
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 
/* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PAGEMAP_H #define _LINUX_PAGEMAP_H /* * Copyright 1995 Linus Torvalds */ #include <linux/mm.h> #include <linux/fs.h> #include <linux/list.h> #include <linux/highmem.h> #include <linux/compiler.h> #include <linux/uaccess.h> #include <linux/gfp.h> #include <linux/bitops.h> #include <linux/hardirq.h> /* for in_interrupt() */ #include <linux/hugetlb_inline.h> struct pagevec; /* * Bits in mapping->flags. */ enum mapping_flags { AS_EIO = 0, /* IO error on async write */ AS_ENOSPC = 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */ AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */ AS_EXITING = 4, /* final truncate in progress */ /* writeback related tags are not used */ AS_NO_WRITEBACK_TAGS = 5, AS_THP_SUPPORT = 6, /* THPs supported */ }; /** * mapping_set_error - record a writeback error in the address_space * @mapping: the mapping in which an error should be set * @error: the error to set in the mapping * * When writeback fails in some way, we must record that error so that * userspace can be informed when fsync and the like are called. We endeavor * to report errors on any file that was open at the time of the error. Some * internal callers also need to know when writeback errors have occurred. * * When a writeback error occurs, most filesystems will want to call * mapping_set_error to record the error in the mapping so that it can be * reported when the application calls fsync(2). */ static inline void mapping_set_error(struct address_space *mapping, int error) { if (likely(!error)) return; /* Record in wb_err for checkers using errseq_t based tracking */ __filemap_set_wb_err(mapping, error); /* Record it in superblock */ if (mapping->host) errseq_set(&mapping->host->i_sb->s_wb_err, error); /* Record it in flags for now, for legacy callers */ if (error == -ENOSPC) set_bit(AS_ENOSPC, &mapping->flags); else set_bit(AS_EIO, &mapping->flags); } static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_clear_unevictable(struct address_space *mapping) { clear_bit(AS_UNEVICTABLE, &mapping->flags); } static inline bool mapping_unevictable(struct address_space *mapping) { return mapping && test_bit(AS_UNEVICTABLE, &mapping->flags); } static inline void mapping_set_exiting(struct address_space *mapping) { set_bit(AS_EXITING, &mapping->flags); } static inline int mapping_exiting(struct address_space *mapping) { return test_bit(AS_EXITING, &mapping->flags); } static inline void mapping_set_no_writeback_tags(struct address_space *mapping) { set_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline int mapping_use_writeback_tags(struct address_space *mapping) { return !test_bit(AS_NO_WRITEBACK_TAGS, &mapping->flags); } static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { return mapping->gfp_mask; } /* Restricts the given gfp_mask to what the mapping allows.
*/ static inline gfp_t mapping_gfp_constraint(struct address_space *mapping, gfp_t gfp_mask) { return mapping_gfp_mask(mapping) & gfp_mask; } /* * This is non-atomic. Only to be used before the mapping is activated. * Probably needs a barrier... */ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) { m->gfp_mask = mask; } static inline bool mapping_thp_support(struct address_space *mapping) { return test_bit(AS_THP_SUPPORT, &mapping->flags); } static inline int filemap_nr_thps(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS return atomic_read(&mapping->nr_thps); #else return 0; #endif } static inline void filemap_nr_thps_inc(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_thp_support(mapping)) atomic_inc(&mapping->nr_thps); #else WARN_ON_ONCE(1); #endif } static inline void filemap_nr_thps_dec(struct address_space *mapping) { #ifdef CONFIG_READ_ONLY_THP_FOR_FS if (!mapping_thp_support(mapping)) atomic_dec(&mapping->nr_thps); #else WARN_ON_ONCE(1); #endif } void release_pages(struct page **pages, int nr); /* * speculatively take a reference to a page. * If the page is free (_refcount == 0), then _refcount is untouched, and 0 * is returned. Otherwise, _refcount is incremented by 1 and 1 is returned. * * This function must be called inside the same rcu_read_lock() section as has * been used to lookup the page in the pagecache radix-tree (or page table): * this allows allocators to use a synchronize_rcu() to stabilize _refcount. * * Unless an RCU grace period has passed, the count of all pages coming out * of the allocator must be considered unstable. page_count may return higher * than expected, and put_page must be able to do the right thing when the * page has been finished with, no matter what it is subsequently allocated * for (because put_page is what is used here to drop an invalid speculative * reference). * * This is the interesting part of the lockless pagecache (and lockless * get_user_pages) locking protocol, where the lookup-side (eg. find_get_page) * has the following pattern: * 1. find page in radix tree * 2. conditionally increment refcount * 3. check the page is still in pagecache (if no, goto 1) * * Remove-side that cares about stability of _refcount (eg. reclaim) has the * following (with the i_pages lock held): * A. atomically check refcount is correct and set it to 0 (atomic_cmpxchg) * B. remove page from pagecache * C. free the page * * There are 2 critical interleavings that matter: * - 2 runs before A: in this case, A sees elevated refcount and bails out * - A runs before 2: in this case, 2 sees zero refcount and retries; * subsequently, B will complete and 1 will find no page, causing the * lookup to return NULL. * * It is possible that between 1 and 2, the page is removed then the exact same * page is inserted into the same position in pagecache. That's OK: the * old find_get_page using a lock could equally have run before or after * such a re-insertion, depending on order that locks are granted. * * Lookups racing against pagecache insertion isn't a big problem: either 1 * will find the page or it will not. Likewise, the old find_get_page could run * either before the insertion or afterwards, depending on timing. */ static inline int __page_cache_add_speculative(struct page *page, int count) { #ifdef CONFIG_TINY_RCU # ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic() && !irqs_disabled()); # endif /* * Preempt must be disabled here - we rely on rcu_read_lock doing * this for us. 
* * Pagecache won't be truncated from interrupt context, so if we have * found a page in the radix tree here, we have pinned its refcount by * disabling preempt, and hence no need for the "speculative get" that * SMP requires. */ VM_BUG_ON_PAGE(page_count(page) == 0, page); page_ref_add(page, count); #else if (unlikely(!page_ref_add_unless(page, count, 0))) { /* * Either the page has been freed, or will be freed. * In either case, retry here and the caller should * do the right thing (see comments above). */ return 0; } #endif VM_BUG_ON_PAGE(PageTail(page), page); return 1; } static inline int page_cache_get_speculative(struct page *page) { return __page_cache_add_speculative(page, 1); } static inline int page_cache_add_speculative(struct page *page, int count) { return __page_cache_add_speculative(page, count); } /** * attach_page_private - Attach private data to a page. * @page: Page to attach data to. * @data: Data to attach to page. * * Attaching private data to a page increments the page's reference count. * The data must be detached before the page will be freed. */ static inline void attach_page_private(struct page *page, void *data) { get_page(page); set_page_private(page, (unsigned long)data); SetPagePrivate(page); } /** * detach_page_private - Detach private data from a page. * @page: Page to detach data from. * * Removes the data that was previously attached to the page and decrements * the refcount on the page. * * Return: Data that was attached to the page. */ static inline void *detach_page_private(struct page *page) { void *data = (void *)page_private(page); if (!PagePrivate(page)) return NULL; ClearPagePrivate(page); set_page_private(page, 0); put_page(page); return data; } #ifdef CONFIG_NUMA extern struct page *__page_cache_alloc(gfp_t gfp); #else static inline struct page *__page_cache_alloc(gfp_t gfp) { return alloc_pages(gfp, 0); } #endif static inline struct page *page_cache_alloc(struct address_space *x) { return __page_cache_alloc(mapping_gfp_mask(x)); } static inline gfp_t readahead_gfp_mask(struct address_space *x) { return mapping_gfp_mask(x) | __GFP_NORETRY | __GFP_NOWARN; } typedef int filler_t(void *, struct page *); pgoff_t page_cache_next_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); pgoff_t page_cache_prev_miss(struct address_space *mapping, pgoff_t index, unsigned long max_scan); #define FGP_ACCESSED 0x00000001 #define FGP_LOCK 0x00000002 #define FGP_CREAT 0x00000004 #define FGP_WRITE 0x00000008 #define FGP_NOFS 0x00000010 #define FGP_NOWAIT 0x00000020 #define FGP_FOR_MMAP 0x00000040 #define FGP_HEAD 0x00000080 struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, int fgp_flags, gfp_t cache_gfp_mask); /** * find_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned with an increased refcount. * * Otherwise, %NULL is returned. */ static inline struct page *find_get_page(struct address_space *mapping, pgoff_t offset) { return pagecache_get_page(mapping, offset, 0, 0); } static inline struct page *find_get_page_flags(struct address_space *mapping, pgoff_t offset, int fgp_flags) { return pagecache_get_page(mapping, offset, fgp_flags, 0); } /** * find_lock_page - locate, pin and lock a pagecache page * @mapping: the address_space to search * @index: the page index * * Looks up the page cache entry at @mapping & @index. 
If there is a * page cache page, it is returned locked and with an increased * refcount. * * Context: May sleep. * Return: A struct page or %NULL if there is no page in the cache for this * index. */ static inline struct page *find_lock_page(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK, 0); } /** * find_lock_head - Locate, pin and lock a pagecache page. * @mapping: The address_space to search. * @index: The page index. * * Looks up the page cache entry at @mapping & @index. If there is a * page cache page, its head page is returned locked and with an increased * refcount. * * Context: May sleep. * Return: A struct page which is !PageTail, or %NULL if there is no page * in the cache for this index. */ static inline struct page *find_lock_head(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK | FGP_HEAD, 0); } /** * find_or_create_page - locate or add a pagecache page * @mapping: the page's address_space * @index: the page's index into the mapping * @gfp_mask: page allocation mode * * Looks up the page cache slot at @mapping & @offset. If there is a * page cache page, it is returned locked and with an increased * refcount. * * If the page is not present, a new page is allocated using @gfp_mask * and added to the page cache and the VM's LRU list. The page is * returned locked and with an increased refcount. * * On memory exhaustion, %NULL is returned. * * find_or_create_page() may sleep, even if @gfp_flags specifies an * atomic allocation! */ static inline struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_ACCESSED|FGP_CREAT, gfp_mask); } /** * grab_cache_page_nowait - returns locked page at given index in given cache * @mapping: target address_space * @index: the page index * * Same as grab_cache_page(), but do not wait if the page is unavailable. * This is intended for speculative data generators, where the data can * be regenerated if the page couldn't be grabbed. This routine should * be safe to call while holding the lock for another page. * * Clear __GFP_FS when allocating the page to avoid recursion into the fs * and deadlock against the caller's locked page. */ static inline struct page *grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) { return pagecache_get_page(mapping, index, FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, mapping_gfp_mask(mapping)); } /* Does this page contain this index? 
*/ static inline bool thp_contains(struct page *head, pgoff_t index) { /* HugeTLBfs indexes the page cache in units of hpage_size */ if (PageHuge(head)) return head->index == index; return page_index(head) == (index & ~(thp_nr_pages(head) - 1UL)); } /* * Given the page we found in the page cache, return the page corresponding * to this index in the file */ static inline struct page *find_subpage(struct page *head, pgoff_t index) { /* HugeTLBfs wants the head page regardless */ if (PageHuge(head)) return head; return head + (index & (thp_nr_pages(head) - 1)); } unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); unsigned find_get_pages_range(struct address_space *mapping, pgoff_t *start, pgoff_t end, unsigned int nr_pages, struct page **pages); static inline unsigned find_get_pages(struct address_space *mapping, pgoff_t *start, unsigned int nr_pages, struct page **pages) { return find_get_pages_range(mapping, start, (pgoff_t)-1, nr_pages, pages); } unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_range_tag(struct address_space *mapping, pgoff_t *index, pgoff_t end, xa_mark_t tag, unsigned int nr_pages, struct page **pages); static inline unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, xa_mark_t tag, unsigned int nr_pages, struct page **pages) { return find_get_pages_range_tag(mapping, index, (pgoff_t)-1, tag, nr_pages, pages); } struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); /* * Returns locked page at given index in given cache, creating it if needed. */ static inline struct page *grab_cache_page(struct address_space *mapping, pgoff_t index) { return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); } extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); extern struct page * read_cache_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern int read_cache_pages(struct address_space *mapping, struct list_head *pages, filler_t *filler, void *data); static inline struct page *read_mapping_page(struct address_space *mapping, pgoff_t index, void *data) { return read_cache_page(mapping, index, NULL, data); } /* * Get index of the page within radix-tree (but not for hugetlb pages). * (TODO: remove once hugetlb pages will have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_index(struct page *page) { pgoff_t pgoff; if (likely(!PageTransTail(page))) return page->index; /* * We don't initialize ->index for tail pages: calculate based on * head page */ pgoff = compound_head(page)->index; pgoff += page - compound_head(page); return pgoff; } extern pgoff_t hugetlb_basepage_index(struct page *page); /* * Get the offset in PAGE_SIZE (even for hugetlb pages). * (TODO: hugetlb pages should have ->index in PAGE_SIZE) */ static inline pgoff_t page_to_pgoff(struct page *page) { if (unlikely(PageHuge(page))) return hugetlb_basepage_index(page); return page_to_index(page); } /* * Return byte-offset into filesystem object for page. 
*/ static inline loff_t page_offset(struct page *page) { return ((loff_t)page->index) << PAGE_SHIFT; } static inline loff_t page_file_offset(struct page *page) { return ((loff_t)page_index(page)) << PAGE_SHIFT; } extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address); static inline pgoff_t linear_page_index(struct vm_area_struct *vma, unsigned long address) { pgoff_t pgoff; if (unlikely(is_vm_hugetlb_page(vma))) return linear_hugepage_index(vma, address); pgoff = (address - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; return pgoff; } struct wait_page_key { struct page *page; int bit_nr; int page_match; }; struct wait_page_queue { struct page *page; int bit_nr; wait_queue_entry_t wait; }; static inline bool wake_page_match(struct wait_page_queue *wait_page, struct wait_page_key *key) { if (wait_page->page != key->page) return false; key->page_match = 1; if (wait_page->bit_nr != key->bit_nr) return false; return true; } extern void __lock_page(struct page *page); extern int __lock_page_killable(struct page *page); extern int __lock_page_async(struct page *page, struct wait_page_queue *wait); extern int __lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags); extern void unlock_page(struct page *page); /* * Return true if the page was successfully locked */ static inline int trylock_page(struct page *page) { page = compound_head(page); return (likely(!test_and_set_bit_lock(PG_locked, &page->flags))); } /* * lock_page may only be called if we have the page's inode pinned. */ static inline void lock_page(struct page *page) { might_sleep(); if (!trylock_page(page)) __lock_page(page); } /* * lock_page_killable is like lock_page but can be interrupted by fatal * signals. It returns 0 if it locked the page and -EINTR if it was * killed while waiting. */ static inline int lock_page_killable(struct page *page) { might_sleep(); if (!trylock_page(page)) return __lock_page_killable(page); return 0; } /* * lock_page_async - Lock the page, unless this would block. If the page * is already locked, then queue a callback when the page becomes unlocked. * This callback can then retry the operation. * * Returns 0 if the page is locked successfully, or -EIOCBQUEUED if the page * was already locked and the callback defined in 'wait' was queued. */ static inline int lock_page_async(struct page *page, struct wait_page_queue *wait) { if (!trylock_page(page)) return __lock_page_async(page, wait); return 0; } /* * lock_page_or_retry - Lock the page, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __lock_page_or_retry(). */ static inline int lock_page_or_retry(struct page *page, struct mm_struct *mm, unsigned int flags) { might_sleep(); return trylock_page(page) || __lock_page_or_retry(page, mm, flags); } /* * This is exported only for wait_on_page_locked/wait_on_page_writeback, etc., * and should not be used directly. */ extern void wait_on_page_bit(struct page *page, int bit_nr); extern int wait_on_page_bit_killable(struct page *page, int bit_nr); /* * Wait for a page to be unlocked. * * This must be called with the caller "holding" the page, * ie with increased "page->count" so that the page won't * go away during the wait.. 
*/ static inline void wait_on_page_locked(struct page *page) { if (PageLocked(page)) wait_on_page_bit(compound_head(page), PG_locked); } static inline int wait_on_page_locked_killable(struct page *page) { if (!PageLocked(page)) return 0; return wait_on_page_bit_killable(compound_head(page), PG_locked); } extern void put_and_wait_on_page_locked(struct page *page); void wait_on_page_writeback(struct page *page); extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); void page_endio(struct page *page, bool is_write, int err); /* * Add an arbitrary waiter to a page's wait queue */ extern void add_page_wait_queue(struct page *page, wait_queue_entry_t *waiter); /* * Fault everything in given userspace address range in. */ static inline int fault_in_pages_writeable(char __user *uaddr, int size) { char __user *end = uaddr + size - 1; if (unlikely(size == 0)) return 0; if (unlikely(uaddr > end)) return -EFAULT; /* * Writing zeroes into userspace here is OK, because we know that if * the zero gets there, we'll be overwriting it. */ do { if (unlikely(__put_user(0, uaddr) != 0)) return -EFAULT; uaddr += PAGE_SIZE; } while (uaddr <= end); /* Check whether the range spilled into the next page. */ if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) return __put_user(0, end); return 0; } static inline int fault_in_pages_readable(const char __user *uaddr, int size) { volatile char c; const char __user *end = uaddr + size - 1; if (unlikely(size == 0)) return 0; if (unlikely(uaddr > end)) return -EFAULT; do { if (unlikely(__get_user(c, uaddr) != 0)) return -EFAULT; uaddr += PAGE_SIZE; } while (uaddr <= end); /* Check whether the range spilled into the next page. */ if (((unsigned long)uaddr & PAGE_MASK) == ((unsigned long)end & PAGE_MASK)) { return __get_user(c, end); } (void)c; return 0; } int add_to_page_cache_locked(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp_mask); extern void delete_from_page_cache(struct page *page); extern void __delete_from_page_cache(struct page *page, void *shadow); int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask); void delete_from_page_cache_batch(struct address_space *mapping, struct pagevec *pvec); /* * Like add_to_page_cache_locked, but used to add newly allocated pages: * the page is new, so we can just run __SetPageLocked() against it. */ static inline int add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { int error; __SetPageLocked(page); error = add_to_page_cache_locked(page, mapping, offset, gfp_mask); if (unlikely(error)) __ClearPageLocked(page); return error; } /** * struct readahead_control - Describes a readahead request. * * A readahead request is for consecutive pages. Filesystems which * implement the ->readahead method should call readahead_page() or * readahead_page_batch() in a loop and attempt to start I/O against * each page in the request. * * Most of the fields in this struct are private and should be accessed * by the functions below. * * @file: The file, used primarily by network filesystems for authentication. * May be NULL if invoked internally by the filesystem. * @mapping: Readahead this filesystem object. 
*/ struct readahead_control { struct file *file; struct address_space *mapping; /* private: use the readahead_* accessors instead */ pgoff_t _index; unsigned int _nr_pages; unsigned int _batch_count; }; #define DEFINE_READAHEAD(rac, f, m, i) \ struct readahead_control rac = { \ .file = f, \ .mapping = m, \ ._index = i, \ } #define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) void page_cache_ra_unbounded(struct readahead_control *, unsigned long nr_to_read, unsigned long lookahead_count); void page_cache_sync_ra(struct readahead_control *, struct file_ra_state *, unsigned long req_count); void page_cache_async_ra(struct readahead_control *, struct file_ra_state *, struct page *, unsigned long req_count); /** * page_cache_sync_readahead - generic file readahead * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * * page_cache_sync_readahead() should be called when a cache miss happened: * it will submit the read. The readahead logic may decide to piggyback more * pages onto the read request if access patterns suggest it will improve * performance. */ static inline void page_cache_sync_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, mapping, index); page_cache_sync_ra(&ractl, ra, req_count); } /** * page_cache_async_readahead - file readahead for marked pages * @mapping: address_space which holds the pagecache and I/O vectors * @ra: file_ra_state which holds the readahead state * @file: Used by the filesystem for authentication. * @page: The page at @index which triggered the readahead call. * @index: Index of first page to be read. * @req_count: Total number of pages being read by the caller. * * page_cache_async_readahead() should be called when a page is used which * is marked as PageReadahead; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. */ static inline void page_cache_async_readahead(struct address_space *mapping, struct file_ra_state *ra, struct file *file, struct page *page, pgoff_t index, unsigned long req_count) { DEFINE_READAHEAD(ractl, file, mapping, index); page_cache_async_ra(&ractl, ra, page, req_count); } /** * readahead_page - Get the next page to read. * @rac: The current readahead request. * * Context: The page is locked and has an elevated refcount. The caller * should decreases the refcount once the page has been submitted for I/O * and unlock the page once all I/O to that page has completed. * Return: A pointer to the next page, or %NULL if we are done. 
*/ static inline struct page *readahead_page(struct readahead_control *rac) { struct page *page; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; rac->_index += rac->_batch_count; if (!rac->_nr_pages) { rac->_batch_count = 0; return NULL; } page = xa_load(&rac->mapping->i_pages, rac->_index); VM_BUG_ON_PAGE(!PageLocked(page), page); rac->_batch_count = thp_nr_pages(page); return page; } static inline unsigned int __readahead_batch(struct readahead_control *rac, struct page **array, unsigned int array_sz) { unsigned int i = 0; XA_STATE(xas, &rac->mapping->i_pages, 0); struct page *page; BUG_ON(rac->_batch_count > rac->_nr_pages); rac->_nr_pages -= rac->_batch_count; rac->_index += rac->_batch_count; rac->_batch_count = 0; xas_set(&xas, rac->_index); rcu_read_lock(); xas_for_each(&xas, page, rac->_index + rac->_nr_pages - 1) { if (xas_retry(&xas, page)) continue; VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(PageTail(page), page); array[i++] = page; rac->_batch_count += thp_nr_pages(page); /* * The page cache isn't using multi-index entries yet, * so the xas cursor needs to be manually moved to the * next index. This can be removed once the page cache * is converted. */ if (PageHead(page)) xas_set(&xas, rac->_index + rac->_batch_count); if (i == array_sz) break; } rcu_read_unlock(); return i; } /** * readahead_page_batch - Get a batch of pages to read. * @rac: The current readahead request. * @array: An array of pointers to struct page. * * Context: The pages are locked and have an elevated refcount. The caller * should decreases the refcount once the page has been submitted for I/O * and unlock the page once all I/O to that page has completed. * Return: The number of pages placed in the array. 0 indicates the request * is complete. */ #define readahead_page_batch(rac, array) \ __readahead_batch(rac, array, ARRAY_SIZE(array)) /** * readahead_pos - The byte offset into the file of this readahead request. * @rac: The readahead request. */ static inline loff_t readahead_pos(struct readahead_control *rac) { return (loff_t)rac->_index * PAGE_SIZE; } /** * readahead_length - The number of bytes in this readahead request. * @rac: The readahead request. */ static inline loff_t readahead_length(struct readahead_control *rac) { return (loff_t)rac->_nr_pages * PAGE_SIZE; } /** * readahead_index - The index of the first page in this readahead request. * @rac: The readahead request. */ static inline pgoff_t readahead_index(struct readahead_control *rac) { return rac->_index; } /** * readahead_count - The number of pages in this readahead request. * @rac: The readahead request. */ static inline unsigned int readahead_count(struct readahead_control *rac) { return rac->_nr_pages; } static inline unsigned long dir_pages(struct inode *inode) { return (unsigned long)(inode->i_size + PAGE_SIZE - 1) >> PAGE_SHIFT; } /** * page_mkwrite_check_truncate - check if page was truncated * @page: the page to check * @inode: the inode to check the page against * * Returns the number of bytes in the page up to EOF, * or -EFAULT if the page was truncated. 
*/ static inline int page_mkwrite_check_truncate(struct page *page, struct inode *inode) { loff_t size = i_size_read(inode); pgoff_t index = size >> PAGE_SHIFT; int offset = offset_in_page(size); if (page->mapping != inode->i_mapping) return -EFAULT; /* page is wholly inside EOF */ if (page->index < index) return PAGE_SIZE; /* page is wholly past EOF */ if (page->index > index || !offset) return -EFAULT; /* page is partially inside EOF */ return offset; } /** * i_blocks_per_page - How many blocks fit in this page. * @inode: The inode which contains the blocks. * @page: The page (head page if the page is a THP). * * If the block size is larger than the size of this page, return zero. * * Context: The caller should hold a refcount on the page to prevent it * from being split. * Return: The number of filesystem blocks covered by this page. */ static inline unsigned int i_blocks_per_page(struct inode *inode, struct page *page) { return thp_size(page) >> inode->i_blkbits; } #endif /* _LINUX_PAGEMAP_H */
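The pagecache lookup interface above is easiest to see from a small caller. The sketch below is illustrative only (example_touch_page() is a hypothetical function, not part of this header); it relies on the documented find_or_create_page() contract: the page comes back locked and with an elevated refcount, so the caller must unlock_page() and put_page() it when done.

#include <linux/errno.h>
#include <linux/pagemap.h>

static int example_touch_page(struct address_space *mapping, pgoff_t index)
{
	struct page *page;

	/* Locate the page, allocating and locking it if it is absent. */
	page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
	if (!page)
		return -ENOMEM;

	/* ... read or fill the page contents here ... */

	unlock_page(page);	/* drop the FGP_LOCK lock */
	put_page(page);		/* drop the reference taken by the lookup */
	return 0;
}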
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM filemap

#if !defined(_TRACE_FILEMAP_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_FILEMAP_H

#include <linux/types.h>
#include <linux/tracepoint.h>
#include <linux/mm.h>
#include <linux/memcontrol.h>
#include <linux/device.h>
#include <linux/kdev_t.h>
#include <linux/errseq.h>

DECLARE_EVENT_CLASS(mm_filemap_op_page_cache,

	TP_PROTO(struct page *page),

	TP_ARGS(page),

	TP_STRUCT__entry(
		__field(unsigned long, pfn)
		__field(unsigned long, i_ino)
		__field(unsigned long, index)
		__field(dev_t, s_dev)
	),

	TP_fast_assign(
		__entry->pfn = page_to_pfn(page);
		__entry->i_ino = page->mapping->host->i_ino;
		__entry->index = page->index;
		if (page->mapping->host->i_sb)
			__entry->s_dev = page->mapping->host->i_sb->s_dev;
		else
			__entry->s_dev = page->mapping->host->i_rdev;
	),

	TP_printk("dev %d:%d ino %lx page=%p pfn=%lu ofs=%lu",
		MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
		__entry->i_ino,
		pfn_to_page(__entry->pfn),
		__entry->pfn,
		__entry->index << PAGE_SHIFT)
);

DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_delete_from_page_cache,
	TP_PROTO(struct page *page),
	TP_ARGS(page)
	);

DEFINE_EVENT(mm_filemap_op_page_cache, mm_filemap_add_to_page_cache,
	TP_PROTO(struct page *page),
	TP_ARGS(page)
	);

TRACE_EVENT(filemap_set_wb_err,
		TP_PROTO(struct address_space *mapping, errseq_t eseq),

		TP_ARGS(mapping, eseq),

		TP_STRUCT__entry(
			__field(unsigned long, i_ino)
			__field(dev_t, s_dev)
			__field(errseq_t, errseq)
		),

		TP_fast_assign(
			__entry->i_ino = mapping->host->i_ino;
			__entry->errseq = eseq;
			if (mapping->host->i_sb)
				__entry->s_dev = mapping->host->i_sb->s_dev;
			else
				__entry->s_dev = mapping->host->i_rdev;
		),

		TP_printk("dev=%d:%d ino=0x%lx errseq=0x%x",
			MAJOR(__entry->s_dev), MINOR(__entry->s_dev),
			__entry->i_ino, __entry->errseq)
);

TRACE_EVENT(file_check_and_advance_wb_err,
		TP_PROTO(struct file *file, errseq_t old),

		TP_ARGS(file, old),

		TP_STRUCT__entry(
			__field(struct file *, file)
			__field(unsigned long, i_ino)
			__field(dev_t, s_dev)
			__field(errseq_t, old)
			__field(errseq_t, new)
		),

		TP_fast_assign(
			__entry->file = file;
			__entry->i_ino = file->f_mapping->host->i_ino;
			if (file->f_mapping->host->i_sb)
				__entry->s_dev = file->f_mapping->host->i_sb->s_dev;
			else
				__entry->s_dev = file->f_mapping->host->i_rdev;
			__entry->old = old;
			__entry->new = file->f_wb_err;
		),

		TP_printk("file=%p dev=%d:%d ino=0x%lx old=0x%x new=0x%x",
			__entry->file, MAJOR(__entry->s_dev),
			MINOR(__entry->s_dev), __entry->i_ino, __entry->old,
			__entry->new)
);
#endif /* _TRACE_FILEMAP_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
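For reference, a tracepoint declared through DECLARE_EVENT_CLASS()/DEFINE_EVENT() above is emitted by calling the generated trace_<name>() helper. The fragment below is a hedged sketch of how a .c file would do that (example_add() is a hypothetical function); exactly one translation unit must define CREATE_TRACE_POINTS before including the header so the tracepoint bodies are instantiated.

#define CREATE_TRACE_POINTS
#include <trace/events/filemap.h>

static void example_add(struct page *page)
{
	/* Fires the mm_filemap_add_to_page_cache event defined above. */
	trace_mm_filemap_add_to_page_cache(page);
}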
#ifndef _LINUX_HASH_H
#define _LINUX_HASH_H
/* Fast hashing routine for ints, longs and pointers.
   (C) 2002 Nadia Yvette Chambers, IBM */

#include <asm/types.h>
#include <linux/compiler.h>

/*
 * The "GOLDEN_RATIO_PRIME" is used in fs/btrfs/btrfs_inode.h and
 * fs/inode.c.  It's not actually prime any more (the previous primes
 * were actively bad for hashing), but the name remains.
 */
#if BITS_PER_LONG == 32
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_32
#define hash_long(val, bits) hash_32(val, bits)
#elif BITS_PER_LONG == 64
#define hash_long(val, bits) hash_64(val, bits)
#define GOLDEN_RATIO_PRIME GOLDEN_RATIO_64
#else
#error Wordsize not 32 or 64
#endif

/*
 * This hash multiplies the input by a large odd number and takes the
 * high bits.  Since multiplication propagates changes to the most
 * significant end only, it is essential that the high bits of the
 * product be used for the hash value.
 *
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * Although a random odd number will do, it turns out that the golden
 * ratio phi = (sqrt(5)-1)/2, or its negative, has particularly nice
 * properties.  (See Knuth vol 3, section 6.4, exercise 9.)
 *
 * These are the negative, (1 - phi) = phi**2 = (3 - sqrt(5))/2,
 * which is very slightly easier to multiply by and makes no
 * difference to the hash distribution.
 */
#define GOLDEN_RATIO_32 0x61C88647
#define GOLDEN_RATIO_64 0x61C8864680B583EBull

#ifdef CONFIG_HAVE_ARCH_HASH
/* This header may use the GOLDEN_RATIO_xx constants */
#include <asm/hash.h>
#endif

/*
 * The _generic versions exist only so lib/test_hash.c can compare
 * the arch-optimized versions with the generic.
 *
 * Note that if you change these, any <asm/hash.h> that aren't updated
 * to match need to have their HAVE_ARCH_* define values updated so the
 * self-test will not false-positive.
 */
#ifndef HAVE_ARCH__HASH_32
#define __hash_32 __hash_32_generic
#endif
static inline u32 __hash_32_generic(u32 val)
{
	return val * GOLDEN_RATIO_32;
}

#ifndef HAVE_ARCH_HASH_32
#define hash_32 hash_32_generic
#endif
static inline u32 hash_32_generic(u32 val, unsigned int bits)
{
	/* High bits are more random, so use them. */
	return __hash_32(val) >> (32 - bits);
}

#ifndef HAVE_ARCH_HASH_64
#define hash_64 hash_64_generic
#endif
static __always_inline u32 hash_64_generic(u64 val, unsigned int bits)
{
#if BITS_PER_LONG == 64
	/* 64x64-bit multiply is efficient on all 64-bit processors */
	return val * GOLDEN_RATIO_64 >> (64 - bits);
#else
	/* Hash 64 bits using only 32x32-bit multiply. */
	return hash_32((u32)val ^ __hash_32(val >> 32), bits);
#endif
}

static inline u32 hash_ptr(const void *ptr, unsigned int bits)
{
	return hash_long((unsigned long)ptr, bits);
}

/* This really should be called fold32_ptr; it does no hashing to speak of. */
static inline u32 hash32_ptr(const void *ptr)
{
	unsigned long val = (unsigned long)ptr;

#if BITS_PER_LONG == 64
	val ^= (val >> 32);
#endif
	return (u32)val;
}

#endif /* _LINUX_HASH_H */
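A typical use of the helpers above is picking a bucket in a power-of-two sized hash table. The sketch below is illustrative only (EXAMPLE_HASH_BITS, example_table and example_bucket() are made-up names): hash_long() reduces an arbitrary key to EXAMPLE_HASH_BITS well-mixed bits, which index the table directly.

#include <linux/types.h>
#include <linux/hash.h>

#define EXAMPLE_HASH_BITS	6	/* 1 << 6 == 64 buckets */

static struct hlist_head example_table[1 << EXAMPLE_HASH_BITS];

static struct hlist_head *example_bucket(unsigned long key)
{
	/* hash_long() multiplies by the golden ratio and keeps the high bits. */
	return &example_table[hash_long(key, EXAMPLE_HASH_BITS)];
}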
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 /* SPDX-License-Identifier: GPL-2.0-only */ /* * fs/kernfs/kernfs-internal.h - kernfs internal header file * * Copyright (c) 2001-3 Patrick Mochel * Copyright (c) 2007 SUSE Linux Products GmbH * Copyright (c) 2007, 2013 Tejun Heo <teheo@suse.de> */ #ifndef __KERNFS_INTERNAL_H #define __KERNFS_INTERNAL_H #include <linux/lockdep.h> #include <linux/fs.h> #include <linux/mutex.h> #include <linux/xattr.h> #include <linux/kernfs.h> #include <linux/fs_context.h> struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; struct timespec64 ia_atime; struct timespec64 ia_mtime; struct timespec64 ia_ctime; struct simple_xattrs xattrs; atomic_t nr_user_xattrs; atomic_t user_xattr_size; }; /* +1 to avoid triggering overflow warning when negating it */ #define KN_DEACTIVATED_BIAS (INT_MIN + 1) /* KERNFS_TYPE_MASK and types are defined in include/linux/kernfs.h */ /** * kernfs_root - find out the kernfs_root a kernfs_node belongs to * @kn: kernfs_node of interest * * Return the kernfs_root @kn belongs to. */ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) { /* if parent exists, it's always a dir; otherwise, @sd is a dir */ if (kn->parent) kn = kn->parent; return kn->dir.root; } /* * mount.c */ struct kernfs_super_info { struct super_block *sb; /* * The root associated with this super_block. Each super_block is * identified by the root and ns it's associated with. */ struct kernfs_root *root; /* * Each sb is associated with one namespace tag, currently the * network namespace of the task which mounted this kernfs * instance. If multiple tags become necessary, make the following * an array and compare kernfs_node tag against every entry. 
*/ const void *ns; /* anchored at kernfs_root->supers, protected by kernfs_mutex */ struct list_head node; }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) return NULL; return d_inode(dentry)->i_private; } extern const struct super_operations kernfs_sops; extern struct kmem_cache *kernfs_node_cache, *kernfs_iattrs_cache; /* * inode.c */ extern const struct xattr_handler *kernfs_xattr_handlers[]; void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); int kernfs_iop_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size); int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr); /* * dir.c */ extern struct mutex kernfs_mutex; extern const struct dentry_operations kernfs_dops; extern const struct file_operations kernfs_dir_fops; extern const struct inode_operations kernfs_dir_iops; struct kernfs_node *kernfs_get_active(struct kernfs_node *kn); void kernfs_put_active(struct kernfs_node *kn); int kernfs_add_one(struct kernfs_node *kn); struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, const char *name, umode_t mode, kuid_t uid, kgid_t gid, unsigned flags); /* * file.c */ extern const struct file_operations kernfs_file_fops; void kernfs_drain_open_files(struct kernfs_node *kn); /* * symlink.c */ extern const struct inode_operations kernfs_symlink_iops; #endif /* __KERNFS_INTERNAL_H */
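As an illustration of the kernfs_root() helper above, the following hedged sketch (example_same_root() is a hypothetical internal helper, not part of kernfs) shows the intended usage pattern: since a node's parent is always a directory, walking up one level is enough to reach the owning kernfs_root.

#include "kernfs-internal.h"

static bool example_same_root(struct kernfs_node *a, struct kernfs_node *b)
{
	/* kernfs_root() follows ->parent (a directory) to ->dir.root. */
	return kernfs_root(a) == kernfs_root(b);
}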
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_PERCPU_RWSEM_H #define _LINUX_PERCPU_RWSEM_H #include <linux/atomic.h> #include <linux/percpu.h> #include <linux/rcuwait.h> #include <linux/wait.h> #include <linux/rcu_sync.h> #include <linux/lockdep.h> struct percpu_rw_semaphore { struct rcu_sync rss; unsigned int __percpu *read_count; struct rcuwait writer; wait_queue_head_t waiters; atomic_t block; #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif }; #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname }, #else #define __PERCPU_RWSEM_DEP_MAP_INIT(lockname) #endif #define __DEFINE_PERCPU_RWSEM(name, is_static) \ static DEFINE_PER_CPU(unsigned int, __percpu_rwsem_rc_##name); \ is_static struct percpu_rw_semaphore name = { \ .rss = __RCU_SYNC_INITIALIZER(name.rss), \ .read_count = &__percpu_rwsem_rc_##name, \ .writer = __RCUWAIT_INITIALIZER(name.writer), \ .waiters = __WAIT_QUEUE_HEAD_INITIALIZER(name.waiters), \ .block = ATOMIC_INIT(0), \ __PERCPU_RWSEM_DEP_MAP_INIT(name) \ } #define DEFINE_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, /* not static */) #define DEFINE_STATIC_PERCPU_RWSEM(name) \ __DEFINE_PERCPU_RWSEM(name, static) extern bool __percpu_down_read(struct percpu_rw_semaphore *, bool); static inline void percpu_down_read(struct percpu_rw_semaphore *sem) { might_sleep(); rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); preempt_disable(); /* * We are in an RCU-sched read-side critical section, so the writer * cannot both change sem->state from readers_fast and start checking * counters while we are here. So if we see !sem->state, we know that * the writer won't be checking until we're past the preempt_enable() * and that once the synchronize_rcu() is done, the writer will see * anything we did within this RCU-sched read-size critical section. */ if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else __percpu_down_read(sem, false); /* Unconditional memory barrier */ /* * The preempt_enable() prevents the compiler from * bleeding the critical section out. */ preempt_enable(); } static inline bool percpu_down_read_trylock(struct percpu_rw_semaphore *sem) { bool ret = true; preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) this_cpu_inc(*sem->read_count); else ret = __percpu_down_read(sem, true); /* Unconditional memory barrier */ preempt_enable(); /* * The barrier() from preempt_enable() prevents the compiler from * bleeding the critical section out. */ if (ret) rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_); return ret; } static inline void percpu_up_read(struct percpu_rw_semaphore *sem) { rwsem_release(&sem->dep_map, _RET_IP_); preempt_disable(); /* * Same as in percpu_down_read(). */ if (likely(rcu_sync_is_idle(&sem->rss))) { this_cpu_dec(*sem->read_count); } else { /* * slowpath; reader will only ever wake a single blocked * writer. 
*/ smp_mb(); /* B matches C */ /* * In other words, if they see our decrement (presumably to * aggregate zero, as that is the only time it matters) they * will also see our critical section. */ this_cpu_dec(*sem->read_count); rcuwait_wake_up(&sem->writer); } preempt_enable(); } extern void percpu_down_write(struct percpu_rw_semaphore *); extern void percpu_up_write(struct percpu_rw_semaphore *); extern int __percpu_init_rwsem(struct percpu_rw_semaphore *, const char *, struct lock_class_key *); extern void percpu_free_rwsem(struct percpu_rw_semaphore *); #define percpu_init_rwsem(sem) \ ({ \ static struct lock_class_key rwsem_key; \ __percpu_init_rwsem(sem, #sem, &rwsem_key); \ }) #define percpu_rwsem_is_held(sem) lockdep_is_held(sem) #define percpu_rwsem_assert_held(sem) lockdep_assert_held(sem) static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { lock_release(&sem->dep_map, ip); } static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, bool read, unsigned long ip) { lock_acquire(&sem->dep_map, 0, 1, read, 1, NULL, ip); } #endif
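Finally, a short usage sketch for the per-CPU rw-semaphore above (example_rwsem and the two functions are hypothetical): readers pay only a per-CPU increment on the fast path, while a writer forces everyone onto the slow path and waits for the readers to drain.

#include <linux/percpu-rwsem.h>

DEFINE_STATIC_PERCPU_RWSEM(example_rwsem);

static void example_reader(void)
{
	percpu_down_read(&example_rwsem);
	/* Read-side critical section; normally just a this_cpu_inc(). */
	percpu_up_read(&example_rwsem);
}

static void example_writer(void)
{
	percpu_down_write(&example_rwsem);
	/* Exclusive section; all readers have drained by this point. */
	percpu_up_write(&example_rwsem);
}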
914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <asm/processor.h> #include <linux/cpumask.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define ulong2long(a) (*(long *)(&(a))) #define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) #define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. 
*/ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { preempt_enable(); rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active __read_mostly; void rcu_sched_clock_irq(int user); void rcu_report_dead(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void rcu_init_tasks_generic(void); #else static inline void rcu_init_tasks_generic(void) { } #endif #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_NO_HZ_FULL void rcu_user_enter(void); void rcu_user_exit(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } #endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); void rcu_nocb_flush_deferred_wakeup(void); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers * @a: Code that RCU needs to pay attention to. * * RCU read-side critical sections are forbidden in the inner idle loop, * that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU * will happily ignore any such read-side critical sections. However, * things like powertop need tracepoints in the inner idle loop. * * This macro provides the way out: RCU_NONIDLE(do_something_with_RCU()) * will tell RCU that it needs to pay attention, invoke its argument * (in this example, calling the do_something_with_RCU() function), * and then tell RCU to go back to ignoring this CPU. It is permissible * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is * on the order of a million or so, even on 32-bit systems). It is * not legal to block within RCU_NONIDLE(), nor is it permissible to * transfer control either into or out of RCU_NONIDLE()'s statement. */ #define RCU_NONIDLE(a) \ do { \ rcu_irq_enter_irqson(); \ do { a; } while (0); \ rcu_irq_exit_irqson(); \ } while (0) /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU # define rcu_tasks_trace_qs(t) \ do { \ if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ !unlikely(READ_ONCE((t)->trc_reader_nesting))) { \ smp_store_release(&(t)->trc_reader_checked, true); \ smp_mb(); /* Readers partitioned by store. 
*/ \ } \ } while (0) # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs((t)); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. 
*/ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data.unlikely") __warned; \ if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ static inline void rcu_preempt_sleep_check(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0) #define rcu_sleep_check() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. 
*/ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. 
*/ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as * is the case in the context of the RCU callback that is freeing up * the data, or after a synchronize_rcu() returns. This can be useful * when tearing down multi-linked structures after a grace period * has elapsed. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). 
*/ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. 
One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In most situations, rcu_read_unlock() is immune from deadlock. * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock() * is responsible for deboosting, which it does via rt_mutex_unlock(). * Unfortunately, this function acquires the scheduler's runqueue and * priority-inheritance spinlocks. This means that deadlock could result * if the caller of rcu_read_unlock() already holds one of these locks or * any lock that is ever acquired while holding them. * * That said, RCU readers are never priority boosted unless they were * preempted. Therefore, one way to avoid deadlock is to make sure * that preemption never happens within any RCU read-side critical * section whose outermost rcu_read_unlock() is called with one of * rt_mutex_unlock()'s locks held. Such preemption can be avoided in * a number of ways, for example, by invoking preempt_disable() before * critical section's outermost rcu_read_lock(). * * Given that the set of locks acquired by rt_mutex_unlock() might change * at any time, a somewhat more future-proofed approach is to make sure * that that preemption never happens within any RCU read-side critical * section whose outermost rcu_read_unlock() is called with irqs disabled. 
* This approach relies on the fact that rt_mutex_unlock() currently only * acquires irq-disabled locks. * * The second of these two approaches is best in most situations, * however, the first approach can also be useful, at least to those * developers willing to keep abreast of the set of locks acquired by * rt_mutex_unlock(). * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); __release(RCU); __rcu_read_unlock(); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent of rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as * an RCU read-side critical section. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent of rcu_read_lock(), but disables preemption. * Read-side critical sections can also be introduced by anything else * that disables preemption, including local_irq_disable() and friends. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. 
This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /* * Does the specified offset indicate that the corresponding rcu_head * structure can be handled by kvfree_rcu()? */ #define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /* * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. */ #define __kvfree_rcu(head, offset) \ do { \ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. Rather than encoding a * function address in the embedded rcu_head structure, kfree_rcu() instead * encodes the offset of the rcu_head structure within the base structure. * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in __kvfree_rcu(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. 
* * Note that the allowable offset might decrease in the future, for example, * to allow something like kmem_cache_free_rcu(). * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) \ __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ } while (0) /** * kvfree_rcu() - kvfree an object after a grace period. * * This macro consists of one or two arguments and it is * based on whether an object is head-less or not. If it * has a head then a semantic stays the same as it used * to be before: * * kvfree_rcu(ptr, rhf); * * where @ptr is a pointer to kvfree(), @rhf is the name * of the rcu_head structure within the type of @ptr. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kvfree_rcu(ptr); * * where @ptr is a pointer to kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME #define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. 
*/ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; #endif /* __LINUX_RCUPDATE_H */
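/*
 * Illustrative usage sketch -- not part of rcupdate.h.  It shows how the
 * primitives above are typically combined for a single RCU-protected
 * pointer: rcu_dereference() on the read side, rcu_replace_pointer()
 * under an updater lock on the write side, and kfree_rcu() to defer the
 * free past a grace period.  All names here (struct foo, gp, gp_lock,
 * foo_read, foo_update) are hypothetical and exist only for this example.
 */
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int val;
	struct rcu_head rcu;		/* required by kfree_rcu(ptr, rcu) */
};

static struct foo __rcu *gp;		/* hypothetical global */
static DEFINE_SPINLOCK(gp_lock);	/* serializes updaters only */

/* Reader: no locks, just an RCU read-side critical section. */
static int foo_read(void)
{
	struct foo *p;
	int ret = -1;

	rcu_read_lock();
	p = rcu_dereference(gp);
	if (p)
		ret = p->val;
	rcu_read_unlock();
	return ret;
}

/* Updater: publish a new version, then free the old one after a grace period. */
static int foo_update(int val)
{
	struct foo *newp, *oldp;

	newp = kzalloc(sizeof(*newp), GFP_KERNEL);
	if (!newp)
		return -ENOMEM;
	newp->val = val;

	spin_lock(&gp_lock);
	oldp = rcu_replace_pointer(gp, newp, lockdep_is_held(&gp_lock));
	spin_unlock(&gp_lock);

	if (oldp)
		kfree_rcu(oldp, rcu);	/* offset of ->rcu is encoded, no callback needed */
	return 0;
}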
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _FIB_LOOKUP_H
#define _FIB_LOOKUP_H

#include <linux/types.h>
#include <linux/list.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>

struct fib_alias {
	struct hlist_node	fa_list;
	struct fib_info		*fa_info;
	u8			fa_tos;
	u8			fa_type;
	u8			fa_state;
	u8			fa_slen;
	u32			tb_id;
	s16			fa_default;
	u8			offload:1,
				trap:1,
				unused:6;
	struct rcu_head		rcu;
};

#define FA_S_ACCESSED	0x01

/* Don't write on fa_state unless needed, to keep it shared on all cpus */
static inline void fib_alias_accessed(struct fib_alias *fa)
{
	if (!(fa->fa_state & FA_S_ACCESSED))
		fa->fa_state |= FA_S_ACCESSED;
}

/* Exported by fib_semantics.c */
void fib_release_info(struct fib_info *);
struct fib_info *fib_create_info(struct fib_config *cfg,
				 struct netlink_ext_ack *extack);
int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi,
		 struct netlink_ext_ack *extack);
bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi);
int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
		  struct fib_rt_info *fri, unsigned int flags);
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa, int dst_len,
	       u32 tb_id, const struct nl_info *info, unsigned int nlm_flags);

static inline void fib_result_assign(struct fib_result *res,
				     struct fib_info *fi)
{
	/* we used to play games with refcounts, but we now use RCU */
	res->fi = fi;
	res->nhc = fib_info_nhc(fi, 0);
}

struct fib_prop {
	int	error;
	u8	scope;
};

extern const struct fib_prop fib_props[RTN_MAX + 1];

#endif /* _FIB_LOOKUP_H */
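/*
 * Illustrative sketch -- not part of fib_lookup.h.  It shows, in heavily
 * simplified form, how a lookup path might walk an RCU-protected list of
 * fib_alias entries and hand the winning fib_info to fib_result_assign();
 * the real logic lives in net/ipv4/fib_trie.c.  The fa_head list, the
 * tos-matching rule and the function name are placeholders for this
 * example, and the caller is assumed to hold rcu_read_lock().
 */
#include <linux/rculist.h>

static bool fib_alias_pick(struct hlist_head *fa_head, u8 tos,
			   struct fib_result *res)
{
	struct fib_alias *fa;

	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
		/* Simplified match: a non-zero fa_tos must equal the request. */
		if (fa->fa_tos && fa->fa_tos != tos)
			continue;

		fib_alias_accessed(fa);			/* touches fa_state only when needed */
		fib_result_assign(res, fa->fa_info);	/* RCU protects fi, no refcount games */
		return true;
	}
	return false;
}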
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_RMAP_H #define _LINUX_RMAP_H /* * Declarations for Reverse Mapping functions in mm/rmap.c */ #include <linux/list.h> #include <linux/slab.h> #include <linux/mm.h> #include <linux/rwsem.h> #include <linux/memcontrol.h> #include <linux/highmem.h> /* * The anon_vma heads a list of private "related" vmas, to scan if * an anonymous page pointing to this anon_vma needs to be unmapped: * the vmas on the list will be related by forking, or by splitting. * * Since vmas come and go as they are split and merged (particularly * in mprotect), the mapping field of an anonymous page cannot point * directly to a vma: instead it points to an anon_vma, on whose list * the related vmas can be easily linked or unlinked. * * After unlinking the last vma on the list, we must garbage collect * the anon_vma object itself: we're guaranteed no page can be * pointing to this anon_vma once its vma list is empty. */ struct anon_vma { struct anon_vma *root; /* Root of this anon_vma tree */ struct rw_semaphore rwsem; /* W: modification, R: walking the list */ /* * The refcount is taken on an anon_vma when there is no * guarantee that the vma of page tables will exist for * the duration of the operation. A caller that takes * the reference is responsible for clearing up the * anon_vma if they are the last user on release */ atomic_t refcount; /* * Count of child anon_vmas and VMAs which points to this anon_vma. * * This counter is used for making decision about reusing anon_vma * instead of forking new one. See comments in function anon_vma_clone. */ unsigned degree; struct anon_vma *parent; /* Parent of this anon_vma */ /* * NOTE: the LSB of the rb_root.rb_node is set by * mm_take_all_locks() _after_ taking the above lock. So the * rb_root must only be read/written after taking the above lock * to be sure to see a valid next pointer. The LSB bit itself * is serialized by a system wide lock only visible to * mm_take_all_locks() (mm_all_locks_mutex). */ /* Interval tree of private "related" vmas */ struct rb_root_cached rb_root; }; /* * The copy-on-write semantics of fork mean that an anon_vma * can become associated with multiple processes. Furthermore, * each child process will have its own anon_vma, where new * pages for that process are instantiated. * * This structure allows us to find the anon_vmas associated * with a VMA, or the VMAs associated with an anon_vma. 
* The "same_vma" list contains the anon_vma_chains linking * all the anon_vmas associated with this VMA. * The "rb" field indexes on an interval tree the anon_vma_chains * which link all the VMAs associated with this anon_vma. */ struct anon_vma_chain { struct vm_area_struct *vma; struct anon_vma *anon_vma; struct list_head same_vma; /* locked by mmap_lock & page_table_lock */ struct rb_node rb; /* locked by anon_vma->rwsem */ unsigned long rb_subtree_last; #ifdef CONFIG_DEBUG_VM_RB unsigned long cached_vma_start, cached_vma_last; #endif }; enum ttu_flags { TTU_MIGRATION = 0x1, /* migration mode */ TTU_MUNLOCK = 0x2, /* munlock mode */ TTU_SPLIT_HUGE_PMD = 0x4, /* split huge PMD if any */ TTU_IGNORE_MLOCK = 0x8, /* ignore mlock */ TTU_SYNC = 0x10, /* avoid racy checks with PVMW_SYNC */ TTU_IGNORE_HWPOISON = 0x20, /* corrupted page is recoverable */ TTU_BATCH_FLUSH = 0x40, /* Batch TLB flushes where possible * and caller guarantees they will * do a final flush if necessary */ TTU_RMAP_LOCKED = 0x80, /* do not grab rmap lock: * caller holds it */ TTU_SPLIT_FREEZE = 0x100, /* freeze pte under splitting thp */ }; #ifdef CONFIG_MMU static inline void get_anon_vma(struct anon_vma *anon_vma) { atomic_inc(&anon_vma->refcount); } void __put_anon_vma(struct anon_vma *anon_vma); static inline void put_anon_vma(struct anon_vma *anon_vma) { if (atomic_dec_and_test(&anon_vma->refcount)) __put_anon_vma(anon_vma); } static inline void anon_vma_lock_write(struct anon_vma *anon_vma) { down_write(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_write(struct anon_vma *anon_vma) { up_write(&anon_vma->root->rwsem); } static inline void anon_vma_lock_read(struct anon_vma *anon_vma) { down_read(&anon_vma->root->rwsem); } static inline void anon_vma_unlock_read(struct anon_vma *anon_vma) { up_read(&anon_vma->root->rwsem); } /* * anon_vma helper functions. */ void anon_vma_init(void); /* create anon_vma_cachep */ int __anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); static inline int anon_vma_prepare(struct vm_area_struct *vma) { if (likely(vma->anon_vma)) return 0; return __anon_vma_prepare(vma); } static inline void anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) { VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma); unlink_anon_vmas(next); } struct anon_vma *page_get_anon_vma(struct page *page); /* bitflags for do_page_add_anon_rmap() */ #define RMAP_EXCLUSIVE 0x01 #define RMAP_COMPOUND 0x02 /* * rmap interfaces called when adding or removing pte of page */ void page_move_anon_rmap(struct page *, struct vm_area_struct *); void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); void do_page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, int); void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long, bool); void page_add_file_rmap(struct page *, bool); void page_remove_rmap(struct page *, bool); void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); void hugepage_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long); static inline void page_dup_rmap(struct page *page, bool compound) { atomic_inc(compound ? 
compound_mapcount_ptr(page) : &page->_mapcount); } /* * Called from mm/vmscan.c to handle paging out */ int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); bool try_to_unmap(struct page *, enum ttu_flags flags); /* Avoid racy checks */ #define PVMW_SYNC (1 << 0) /* Look for migarion entries rather than present PTEs */ #define PVMW_MIGRATION (1 << 1) struct page_vma_mapped_walk { struct page *page; struct vm_area_struct *vma; unsigned long address; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; unsigned int flags; }; static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw) { /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */ if (pvmw->pte && !PageHuge(pvmw->page)) pte_unmap(pvmw->pte); if (pvmw->ptl) spin_unlock(pvmw->ptl); } bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); /* * Used by swapoff to help locate where page is expected in vma. */ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); /* * Cleans the PTEs of shared mappings. * (and since clean PTEs should also be readonly, write protects them too) * * returns the number of cleaned PTEs. */ int page_mkclean(struct page *); /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ void try_to_munlock(struct page *); void remove_migration_ptes(struct page *old, struct page *new, bool locked); /* * Called by memory-failure.c to kill processes. */ struct anon_vma *page_lock_anon_vma_read(struct page *page); void page_unlock_anon_vma_read(struct anon_vma *anon_vma); int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); /* * rmap_walk_control: To control rmap traversing for specific needs * * arg: passed to rmap_one() and invalid_vma() * rmap_one: executed on each vma where page is mapped * done: for checking traversing termination condition * anon_lock: for getting anon_lock by optimized way rather than default * invalid_vma: for skipping uninterested vma */ struct rmap_walk_control { void *arg; /* * Return false if page table scanning in rmap_walk should be stopped. * Otherwise, return true. */ bool (*rmap_one)(struct page *page, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct page *page); struct anon_vma *(*anon_lock)(struct page *page); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct page *page, struct rmap_walk_control *rwc); void rmap_walk_locked(struct page *page, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ #define anon_vma_init() do {} while (0) #define anon_vma_prepare(vma) (0) #define anon_vma_link(vma) do {} while (0) static inline int page_referenced(struct page *page, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags) { *vm_flags = 0; return 0; } #define try_to_unmap(page, refs) false static inline int page_mkclean(struct page *page) { return 0; } #endif /* CONFIG_MMU */ #endif /* _LINUX_RMAP_H */
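/*
 * Illustrative sketch -- not part of rmap.h.  It shows the minimal shape of
 * an rmap_walk() user: a callback invoked once per VMA that maps the page,
 * plus a control structure carrying a private argument.  Real users such as
 * page_referenced() and try_to_unmap() live in mm/rmap.c; the names below
 * (count_arg, count_one_vma, count_page_mappings) are made up, and the
 * caller is assumed to hold the page lock as rmap_walk() expects.
 */
struct count_arg {
	int mapped;
};

static bool count_one_vma(struct page *page, struct vm_area_struct *vma,
			  unsigned long addr, void *arg)
{
	struct count_arg *ca = arg;

	ca->mapped++;
	return true;			/* true: keep walking the remaining VMAs */
}

static int count_page_mappings(struct page *page)
{
	struct count_arg ca = { .mapped = 0 };
	struct rmap_walk_control rwc = {
		.arg		= &ca,
		.rmap_one	= count_one_vma,
		/* .done, .anon_lock and .invalid_vma may stay NULL */
	};

	rmap_walk(page, &rwc);
	return ca.mapped;
}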
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _NETFILTER_INGRESS_H_
#define _NETFILTER_INGRESS_H_

#include <linux/netfilter.h>
#include <linux/netdevice.h>

#ifdef CONFIG_NETFILTER_INGRESS
static inline bool nf_hook_ingress_active(const struct sk_buff *skb)
{
#ifdef CONFIG_JUMP_LABEL
	if (!static_key_false(&nf_hooks_needed[NFPROTO_NETDEV][NF_NETDEV_INGRESS]))
		return false;
#endif
	return rcu_access_pointer(skb->dev->nf_hooks_ingress);
}

/* caller must hold rcu_read_lock */
static inline int nf_hook_ingress(struct sk_buff *skb)
{
	struct nf_hook_entries *e = rcu_dereference(skb->dev->nf_hooks_ingress);
	struct nf_hook_state state;
	int ret;

	/* Must recheck the ingress hook head, in the event it became NULL
	 * after the check in nf_hook_ingress_active evaluated to true.
	 */
	if (unlikely(!e))
		return 0;

	nf_hook_state_init(&state, NF_NETDEV_INGRESS,
			   NFPROTO_NETDEV, skb->dev, NULL, NULL,
			   dev_net(skb->dev), NULL);
	ret = nf_hook_slow(skb, &state, e, 0);
	if (ret == 0)
		return -1;

	return ret;
}

static inline void nf_hook_ingress_init(struct net_device *dev)
{
	RCU_INIT_POINTER(dev->nf_hooks_ingress, NULL);
}
#else /* CONFIG_NETFILTER_INGRESS */
static inline int nf_hook_ingress_active(struct sk_buff *skb)
{
	return 0;
}

static inline int nf_hook_ingress(struct sk_buff *skb)
{
	return 0;
}

static inline void nf_hook_ingress_init(struct net_device *dev) {}
#endif /* CONFIG_NETFILTER_INGRESS */
#endif /* _NETFILTER_INGRESS_H_ */
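/*
 * Illustrative sketch -- not part of netfilter_ingress.h.  It shows the
 * calling convention the core receive path (net/core/dev.c) uses for these
 * helpers: a cheap static-key/pointer check first, then nf_hook_ingress()
 * under rcu_read_lock(), with a negative return meaning the hook consumed
 * the skb.  rx_one_skb() and the "continue processing" step are
 * placeholders for this example.
 */
static int rx_one_skb(struct sk_buff *skb)
{
	if (nf_hook_ingress_active(skb)) {
		int ret;

		rcu_read_lock();
		ret = nf_hook_ingress(skb);
		rcu_read_unlock();

		if (ret < 0)
			return ret;	/* skb stolen or dropped by an ingress hook */
	}

	/* ... continue normal receive processing of @skb here ... */
	return 0;
}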
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Berkeley style UIO structures - Alan Cox 1994. */ #ifndef __LINUX_UIO_H #define __LINUX_UIO_H #include <linux/kernel.h> #include <linux/thread_info.h> #include <uapi/linux/uio.h> struct page; struct pipe_inode_info; struct kvec { void *iov_base; /* and that should *never* hold a userland pointer */ size_t iov_len; }; enum iter_type { /* iter types */ ITER_IOVEC = 4, ITER_KVEC = 8, ITER_BVEC = 16, ITER_PIPE = 32, ITER_DISCARD = 64, }; struct iov_iter { /* * Bit 0 is the read/write bit, set if we're writing. * Bit 1 is the BVEC_FLAG_NO_REF bit, set if type is a bvec and * the caller isn't expecting to drop a page reference when done. */ unsigned int type; size_t iov_offset; size_t count; union { const struct iovec *iov; const struct kvec *kvec; const struct bio_vec *bvec; struct pipe_inode_info *pipe; }; union { unsigned long nr_segs; struct { unsigned int head; unsigned int start_head; }; }; }; static inline enum iter_type iov_iter_type(const struct iov_iter *i) { return i->type & ~(READ | WRITE); } static inline bool iter_is_iovec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_IOVEC; } static inline bool iov_iter_is_kvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_KVEC; } static inline bool iov_iter_is_bvec(const struct iov_iter *i) { return iov_iter_type(i) == ITER_BVEC; } static inline bool iov_iter_is_pipe(const struct iov_iter *i) { return iov_iter_type(i) == ITER_PIPE; } static inline bool iov_iter_is_discard(const struct iov_iter *i) { return iov_iter_type(i) == ITER_DISCARD; } static inline unsigned char iov_iter_rw(const struct iov_iter *i) { return i->type & (READ | WRITE); } /* * Total number of bytes covered by an iovec. * * NOTE that it is not safe to use this function until all the iovec's * segment lengths have been validated. Because the individual lengths can * overflow a size_t when added together. 
*/ static inline size_t iov_length(const struct iovec *iov, unsigned long nr_segs) { unsigned long seg; size_t ret = 0; for (seg = 0; seg < nr_segs; seg++) ret += iov[seg].iov_len; return ret; } static inline struct iovec iov_iter_iovec(const struct iov_iter *iter) { return (struct iovec) { .iov_base = iter->iov->iov_base + iter->iov_offset, .iov_len = min(iter->count, iter->iov->iov_len - iter->iov_offset), }; } size_t iov_iter_copy_from_user_atomic(struct page *page, struct iov_iter *i, unsigned long offset, size_t bytes); void iov_iter_advance(struct iov_iter *i, size_t bytes); void iov_iter_revert(struct iov_iter *i, size_t bytes); int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes); size_t iov_iter_single_seg_count(const struct iov_iter *i); size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes, struct iov_iter *i); size_t _copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter(void *addr, size_t bytes, struct iov_iter *i); bool _copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i); size_t _copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i); bool _copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i); static __always_inline __must_check size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, true))) return 0; else return _copy_to_iter(addr, bytes, i); } static __always_inline __must_check size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return 0; else return _copy_from_iter(addr, bytes, i); } static __always_inline __must_check bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return false; else return _copy_from_iter_full(addr, bytes, i); } static __always_inline __must_check size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return 0; else return _copy_from_iter_nocache(addr, bytes, i); } static __always_inline __must_check bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return false; else return _copy_from_iter_full_nocache(addr, bytes, i); } #ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE /* * Note, users like pmem that depend on the stricter semantics of * copy_from_iter_flushcache() than copy_from_iter_nocache() must check for * IS_ENABLED(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) before assuming that the * destination is flushed from the cache on return. 
*/ size_t _copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_from_iter_flushcache _copy_from_iter_nocache #endif #ifdef CONFIG_ARCH_HAS_COPY_MC size_t _copy_mc_to_iter(const void *addr, size_t bytes, struct iov_iter *i); #else #define _copy_mc_to_iter _copy_to_iter #endif static __always_inline __must_check size_t copy_from_iter_flushcache(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, false))) return 0; else return _copy_from_iter_flushcache(addr, bytes, i); } static __always_inline __must_check size_t copy_mc_to_iter(void *addr, size_t bytes, struct iov_iter *i) { if (unlikely(!check_copy_size(addr, bytes, true))) return 0; else return _copy_mc_to_iter(addr, bytes, i); } size_t iov_iter_zero(size_t bytes, struct iov_iter *); unsigned long iov_iter_alignment(const struct iov_iter *i); unsigned long iov_iter_gap_alignment(const struct iov_iter *i); void iov_iter_init(struct iov_iter *i, unsigned int direction, const struct iovec *iov, unsigned long nr_segs, size_t count); void iov_iter_kvec(struct iov_iter *i, unsigned int direction, const struct kvec *kvec, unsigned long nr_segs, size_t count); void iov_iter_bvec(struct iov_iter *i, unsigned int direction, const struct bio_vec *bvec, unsigned long nr_segs, size_t count); void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode_info *pipe, size_t count); void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count); ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages, size_t maxsize, unsigned maxpages, size_t *start); ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, struct page ***pages, size_t maxsize, size_t *start); int iov_iter_npages(const struct iov_iter *i, int maxpages); const void *dup_iter(struct iov_iter *new, struct iov_iter *old, gfp_t flags); static inline size_t iov_iter_count(const struct iov_iter *i) { return i->count; } /* * Cap the iov_iter by given limit; note that the second argument is * *not* the new size - it's upper limit for such. Passing it a value * greater than the amount of data in iov_iter is fine - it'll just do * nothing in that case. */ static inline void iov_iter_truncate(struct iov_iter *i, u64 count) { /* * count doesn't have to fit in size_t - comparison extends both * operands to u64 here and any value that would be truncated by * conversion in assignement is by definition greater than all * values of size_t, including old i->count. */ if (i->count > count) i->count = count; } /* * reexpand a previously truncated iterator; count must be no more than how much * we had shrunk it. 
*/ static inline void iov_iter_reexpand(struct iov_iter *i, size_t count) { i->count = count; } struct csum_state { __wsum csum; size_t off; }; size_t csum_and_copy_to_iter(const void *addr, size_t bytes, void *csstate, struct iov_iter *i); size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); bool csum_and_copy_from_iter_full(void *addr, size_t bytes, __wsum *csum, struct iov_iter *i); size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, struct iov_iter *i); struct iovec *iovec_from_user(const struct iovec __user *uvector, unsigned long nr_segs, unsigned long fast_segs, struct iovec *fast_iov, bool compat); ssize_t import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i); ssize_t __import_iovec(int type, const struct iovec __user *uvec, unsigned nr_segs, unsigned fast_segs, struct iovec **iovp, struct iov_iter *i, bool compat); int import_single_range(int type, void __user *buf, size_t len, struct iovec *iov, struct iov_iter *i); int iov_iter_for_each_range(struct iov_iter *i, size_t bytes, int (*f)(struct kvec *vec, void *context), void *context); #endif
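/*
 * Illustrative sketch -- not part of uio.h.  It shows the common pattern of
 * wrapping a kernel buffer in a single-kvec iov_iter and filling it with
 * copy_to_iter(): the READ direction means data flows *into* the iterator's
 * buffers.  copy_greeting() and the message text are made up for this
 * example.
 */
#include <linux/uio.h>

static size_t copy_greeting(void *dst, size_t len)
{
	static const char msg[] = "hello, iov_iter";
	struct kvec kv = {
		.iov_base = dst,	/* kernel pointer, never a userland one */
		.iov_len  = len,
	};
	struct iov_iter iter;

	iov_iter_kvec(&iter, READ, &kv, 1, len);

	/* Returns the number of bytes actually copied (capped by @len). */
	return copy_to_iter(msg, sizeof(msg), &iter);
}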
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_RTNETLINK_H #define __LINUX_RTNETLINK_H #include <linux/mutex.h> #include <linux/netdevice.h> #include <linux/wait.h> #include <linux/refcount.h> #include <uapi/linux/rtnetlink.h> extern int rtnetlink_send(struct sk_buff *skb, struct net *net, u32 pid, u32 group, int echo); extern int rtnl_unicast(struct sk_buff *skb, struct net *net, u32 pid); extern void rtnl_notify(struct sk_buff *skb, struct net *net, u32 pid, u32 group, struct nlmsghdr *nlh, gfp_t flags); extern void rtnl_set_sk_err(struct net *net, u32 group, int error); extern int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics); extern int rtnl_put_cacheinfo(struct sk_buff *skb, struct dst_entry *dst, u32 id, long expires, u32 error); void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change, gfp_t flags); void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, gfp_t flags, int *new_nsid, int new_ifindex); struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned change, u32 event, gfp_t flags, int *new_nsid, int new_ifindex); void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags); /* RTNL is used as a global lock for all changes to network configuration */ extern void rtnl_lock(void); extern void rtnl_unlock(void); extern int rtnl_trylock(void); extern int rtnl_is_locked(void); extern int rtnl_lock_killable(void); extern bool refcount_dec_and_rtnl_lock(refcount_t *r); extern wait_queue_head_t netdev_unregistering_wq; extern struct rw_semaphore pernet_ops_rwsem; extern struct rw_semaphore net_rwsem; #ifdef CONFIG_PROVE_LOCKING extern bool lockdep_rtnl_is_held(void); #else static inline bool lockdep_rtnl_is_held(void) { return true; } #endif /* #ifdef CONFIG_PROVE_LOCKING */ /** * rcu_dereference_rtnl - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * * Do an rcu_dereference(p), but check caller either holds rcu_read_lock() * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference() */ #define rcu_dereference_rtnl(p) \ rcu_dereference_check(p, lockdep_rtnl_is_held()) /** * rcu_dereference_bh_rtnl - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereference * * Do an rcu_dereference_bh(p), but check caller either holds rcu_read_lock_bh() * or RTNL. Note : Please prefer rtnl_dereference() or rcu_dereference_bh() */ #define rcu_dereference_bh_rtnl(p) \ rcu_dereference_bh_check(p, lockdep_rtnl_is_held()) /** * rtnl_dereference - fetch RCU pointer when updates are prevented by RTNL * @p: The pointer to read, prior to dereferencing * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(), because caller holds RTNL. 
*/ #define rtnl_dereference(p) \ rcu_dereference_protected(p, lockdep_rtnl_is_held()) static inline struct netdev_queue *dev_ingress_queue(struct net_device *dev) { return rtnl_dereference(dev->ingress_queue); } static inline struct netdev_queue *dev_ingress_queue_rcu(struct net_device *dev) { return rcu_dereference(dev->ingress_queue); } struct netdev_queue *dev_ingress_queue_create(struct net_device *dev); #ifdef CONFIG_NET_INGRESS void net_inc_ingress_queue(void); void net_dec_ingress_queue(void); #endif #ifdef CONFIG_NET_EGRESS void net_inc_egress_queue(void); void net_dec_egress_queue(void); #endif void rtnetlink_init(void); void __rtnl_unlock(void); void rtnl_kfree_skbs(struct sk_buff *head, struct sk_buff *tail); #define ASSERT_RTNL() \ WARN_ONCE(!rtnl_is_locked(), \ "RTNL: assertion failed at %s (%d)\n", __FILE__, __LINE__) extern int ndo_dflt_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev, struct net_device *filter_dev, int *idx); extern int ndo_dflt_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid, u16 flags); extern int ndo_dflt_fdb_del(struct ndmsg *ndm, struct nlattr *tb[], struct net_device *dev, const unsigned char *addr, u16 vid); extern int ndo_dflt_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq, struct net_device *dev, u16 mode, u32 flags, u32 mask, int nlflags, u32 filter_mask, int (*vlan_fill)(struct sk_buff *skb, struct net_device *dev, u32 filter_mask)); #endif /* __LINUX_RTNETLINK_H */
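/*
 * Illustrative sketch -- not part of rtnetlink.h.  It contrasts the two
 * access patterns documented above for an RCU pointer that is only modified
 * under RTNL, mirroring dev_ingress_queue()/dev_ingress_queue_rcu(): a
 * control-path read under RTNL with rtnl_dereference(), and a fast-path
 * read that is legal under either RTNL or rcu_read_lock() with
 * rcu_dereference_rtnl().  Both function names are made up for this example.
 */
static bool dev_ingress_queue_present_ctl(struct net_device *dev)
{
	ASSERT_RTNL();		/* control path: caller holds rtnl_lock() */
	return rtnl_dereference(dev->ingress_queue) != NULL;
}

static bool dev_ingress_queue_present_fast(struct net_device *dev)
{
	/* fast path: rcu_read_lock() or RTNL, lockdep accepts either */
	return rcu_dereference_rtnl(dev->ingress_queue) != NULL;
}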
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UIDGID_H #define _LINUX_UIDGID_H /* * A set of types for the internal kernel types representing uids and gids. * * The types defined in this header allow distinguishing which uids and gids in * the kernel are values used by userspace and which uid and gid values are * the internal kernel values. With the addition of user namespaces the values * can be different. Using the type system makes it possible for the compiler * to detect when we overlook these differences. * */ #include <linux/types.h> #include <linux/highuid.h> struct user_namespace; extern struct user_namespace init_user_ns; typedef struct { uid_t val; } kuid_t; typedef struct { gid_t val; } kgid_t; #define KUIDT_INIT(value) (kuid_t){ value } #define KGIDT_INIT(value) (kgid_t){ value } #ifdef CONFIG_MULTIUSER static inline uid_t __kuid_val(kuid_t uid) { return uid.val; } static inline gid_t __kgid_val(kgid_t gid) { return gid.val; } #else static inline uid_t __kuid_val(kuid_t uid) { return 0; } static inline gid_t __kgid_val(kgid_t gid) { return 0; } #endif #define GLOBAL_ROOT_UID KUIDT_INIT(0) #define GLOBAL_ROOT_GID KGIDT_INIT(0) #define INVALID_UID KUIDT_INIT(-1) #define INVALID_GID KGIDT_INIT(-1) static inline bool uid_eq(kuid_t left, kuid_t right) { return __kuid_val(left) == __kuid_val(right); } static inline bool gid_eq(kgid_t left, kgid_t right) { return __kgid_val(left) == __kgid_val(right); } static inline bool uid_gt(kuid_t left, kuid_t right) { return __kuid_val(left) > __kuid_val(right); } static inline bool gid_gt(kgid_t left, kgid_t right) { return __kgid_val(left) > __kgid_val(right); } static inline bool uid_gte(kuid_t left, kuid_t right) { return __kuid_val(left) >= __kuid_val(right); } static inline bool gid_gte(kgid_t left, kgid_t right) { return __kgid_val(left) >= __kgid_val(right); } static inline bool uid_lt(kuid_t left, kuid_t right) { return __kuid_val(left) < __kuid_val(right); } static inline bool gid_lt(kgid_t left, kgid_t right) { return __kgid_val(left) < __kgid_val(right); } static inline bool uid_lte(kuid_t left, kuid_t right) { return __kuid_val(left) <= __kuid_val(right); } static inline bool gid_lte(kgid_t left, kgid_t right) { return __kgid_val(left) <= __kgid_val(right); } static inline bool uid_valid(kuid_t uid) { return __kuid_val(uid) != (uid_t) -1; } static inline bool gid_valid(kgid_t gid) { return __kgid_val(gid) != (gid_t) -1; } #ifdef CONFIG_USER_NS extern kuid_t make_kuid(struct user_namespace *from, uid_t uid); extern kgid_t make_kgid(struct user_namespace *from, gid_t gid); extern uid_t from_kuid(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid(struct user_namespace *to, kgid_t gid); extern uid_t from_kuid_munged(struct user_namespace *to, kuid_t uid); extern gid_t from_kgid_munged(struct user_namespace *to, kgid_t gid); static inline bool kuid_has_mapping(struct 
user_namespace *ns, kuid_t uid) { return from_kuid(ns, uid) != (uid_t) -1; } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return from_kgid(ns, gid) != (gid_t) -1; } #else static inline kuid_t make_kuid(struct user_namespace *from, uid_t uid) { return KUIDT_INIT(uid); } static inline kgid_t make_kgid(struct user_namespace *from, gid_t gid) { return KGIDT_INIT(gid); } static inline uid_t from_kuid(struct user_namespace *to, kuid_t kuid) { return __kuid_val(kuid); } static inline gid_t from_kgid(struct user_namespace *to, kgid_t kgid) { return __kgid_val(kgid); } static inline uid_t from_kuid_munged(struct user_namespace *to, kuid_t kuid) { uid_t uid = from_kuid(to, kuid); if (uid == (uid_t)-1) uid = overflowuid; return uid; } static inline gid_t from_kgid_munged(struct user_namespace *to, kgid_t kgid) { gid_t gid = from_kgid(to, kgid); if (gid == (gid_t)-1) gid = overflowgid; return gid; } static inline bool kuid_has_mapping(struct user_namespace *ns, kuid_t uid) { return uid_valid(uid); } static inline bool kgid_has_mapping(struct user_namespace *ns, kgid_t gid) { return gid_valid(gid); } #endif /* CONFIG_USER_NS */ #endif /* _LINUX_UIDGID_H */
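/*
 * Illustrative sketch -- not part of uidgid.h.  It shows the intended flow:
 * a raw uid_t from userspace is converted into a kuid_t with make_kuid(),
 * validated, and then only ever compared in kuid_t space.  may_act_on_uid()
 * and its policy (owner or global root) are made up for this example.
 */
#include <linux/cred.h>

static bool may_act_on_uid(uid_t uid)
{
	struct user_namespace *ns = current_user_ns();
	kuid_t kuid = make_kuid(ns, uid);

	/* A uid with no mapping in the caller's namespace is rejected. */
	if (!uid_valid(kuid))
		return false;

	/* Compare kuid_t values, never raw uid_t values. */
	return uid_eq(kuid, current_fsuid()) || uid_eq(kuid, GLOBAL_ROOT_UID);
}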
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __ASM_GENERIC_GETORDER_H
#define __ASM_GENERIC_GETORDER_H

#ifndef __ASSEMBLY__

#include <linux/compiler.h>
#include <linux/log2.h>

/**
 * get_order - Determine the allocation order of a memory size
 * @size: The size for which to get the order
 *
 * Determine the allocation order of a particular sized block of memory.  This
 * is on a logarithmic scale, where:
 *
 *	0 -> 2^0 * PAGE_SIZE and below
 *	1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1
 *	2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1
 *	3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1
 *	4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1
 *	...
 *
 * The order returned is used to find the smallest allocation granule required
 * to hold an object of the specified size.
 *
 * The result is undefined if the size is 0.
 */
static inline __attribute_const__ int get_order(unsigned long size)
{
	if (__builtin_constant_p(size)) {
		if (!size)
			return BITS_PER_LONG - PAGE_SHIFT;

		if (size < (1UL << PAGE_SHIFT))
			return 0;

		return ilog2((size) - 1) - PAGE_SHIFT + 1;
	}

	size--;
	size >>= PAGE_SHIFT;
#if BITS_PER_LONG == 32
	return fls(size);
#else
	return fls64(size);
#endif
}

#endif	/* __ASSEMBLY__ */

#endif	/* __ASM_GENERIC_GETORDER_H */
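/*
 * Illustrative sketch -- not part of getorder.h.  With 4 KiB pages,
 * get_order(8192) == 1 and get_order(8193) == 2: the order always rounds
 * the size up to the next power-of-two number of pages.  The helper below
 * (alloc_ring_buffer) is a made-up example of the usual pairing with
 * __get_free_pages()/free_pages().
 */
#include <linux/gfp.h>

static void *alloc_ring_buffer(size_t bytes, unsigned int *order_out)
{
	unsigned int order = get_order(bytes);

	*order_out = order;	/* the caller needs it later for free_pages() */
	return (void *)__get_free_pages(GFP_KERNEL, order);
}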
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Fast and scalable bitmaps. * * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #ifndef __LINUX_SCALE_BITMAP_H #define __LINUX_SCALE_BITMAP_H #include <linux/kernel.h> #include <linux/slab.h> struct seq_file; /** * struct sbitmap_word - Word in a &struct sbitmap. */ struct sbitmap_word { /** * @depth: Number of bits being used in @word/@cleared */ unsigned long depth; /** * @word: word holding free bits */ unsigned long word ____cacheline_aligned_in_smp; /** * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; /** * @swap_lock: Held while swapping word <-> cleared */ spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** * struct sbitmap - Scalable bitmap. * * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This * trades off higher memory usage for better scalability. */ struct sbitmap { /** * @depth: Number of bits used in the whole bitmap. */ unsigned int depth; /** * @shift: log2(number of bits used per word) */ unsigned int shift; /** * @map_nr: Number of words (cachelines) being used for the bitmap. */ unsigned int map_nr; /** * @map: Allocated bitmap. 
*/ struct sbitmap_word *map; }; #define SBQ_WAIT_QUEUES 8 #define SBQ_WAKE_BATCH 8 /** * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { /** * @wait_cnt: Number of frees remaining before we wake up. */ atomic_t wait_cnt; /** * @wait: Wait queue. */ wait_queue_head_t wait; } ____cacheline_aligned_in_smp; /** * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free * bits. * * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to * avoid contention on the wait queue spinlock. This ensures that we don't hit a * scalability wall when we run out of free bits and have to start putting tasks * to sleep. */ struct sbitmap_queue { /** * @sb: Scalable bitmap. */ struct sbitmap sb; /* * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different * cachelines until the map is exhausted. */ unsigned int __percpu *alloc_hint; /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. */ unsigned int wake_batch; /** * @wake_index: Next wait queue in @ws to wake up. */ atomic_t wake_index; /** * @ws: Wait queues. */ struct sbq_wait_state *ws; /* * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; /** * @round_robin: Allocate bits in strict round-robin order. */ bool round_robin; /** * @min_shallow_depth: The minimum shallow depth which may be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). */ unsigned int min_shallow_depth; }; /** * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. * @sb: Bitmap to initialize. * @depth: Number of bits to allocate. * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if * given, a good default is chosen. * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node); /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. */ static inline void sbitmap_free(struct sbitmap *sb) { kfree(sb->map); sb->map = NULL; } /** * sbitmap_resize() - Resize a &struct sbitmap. * @sb: Bitmap to resize. * @depth: New number of bits to resize to. * * Doesn't reallocate anything. It's up to the caller to ensure that the new * depth doesn't exceed the depth that the sb was initialized with. */ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); /** * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. * @sb: Bitmap to allocate from. * @alloc_hint: Hint for where to start searching for a free bit. * @round_robin: If true, be stricter about allocation order; always allocate * starting from the last allocated bit. This is less efficient * than the default behavior (false). * * This operation provides acquire barrier semantics if it succeeds. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); /** * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, * limiting the depth used from each word. * @sb: Bitmap to allocate from. * @alloc_hint: Hint for where to start searching for a free bit. * @shallow_depth: The maximum number of bits to allocate from a single word. * * This rather specific operation allows for having multiple users with * different allocation limits. 
E.g., there can be a high-priority class that * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority * class can only allocate half of the total bits in the bitmap, preventing it * from starving out the high-priority class. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth); /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. * * Return: true if any bit in the bitmap is set, false otherwise. */ bool sbitmap_any_bit_set(const struct sbitmap *sb); #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. * * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ static inline void __sbitmap_for_each_set(struct sbitmap *sb, unsigned int start, sb_for_each_fn fn, void *data) { unsigned int index; unsigned int nr; unsigned int scanned = 0; if (start >= sb->depth) start = 0; index = SB_NR_TO_INDEX(sb, start); nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, sb->map[index].depth - nr, sb->depth - scanned); scanned += depth; word = sb->map[index].word & ~sb->map[index].cleared; if (!word) goto next; /* * On the first iteration of the outer loop, we need to add the * bit offset back to the size of the word for find_next_bit(). * On all other iterations, nr is zero, so this is a noop. */ depth += nr; while (1) { nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } next: nr = 0; if (++index >= sb->map_nr) index = 0; } } /** * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. */ static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, void *data) { __sbitmap_for_each_set(sb, 0, fn, data); } static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) { return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; } /* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) { set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) { clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. 
*/ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) { unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; set_bit(SB_NR_TO_BIT(sb, bitnr), addr); } static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb, unsigned int bitnr) { clear_bit_unlock(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) { return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct * seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The output isn't guaranteed to be internally * consistent. */ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. * * @sbq: Bitmap queue to free. */ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); free_percpu(sbq->alloc_hint); sbitmap_free(&sbq->sb); } /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. * @depth: New number of bits to resize to. * * Like sbitmap_resize(), this doesn't reallocate anything. It has to do * some extra work on the &struct sbitmap_queue, so it's not safe to just * resize the underlying &struct sbitmap. */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); /** * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from. * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. 
*/ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } /** * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int *cpu, unsigned int shallow_depth) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); put_cpu(); return nr; } /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. * @sbq: Bitmap queue in question. * @min_shallow_depth: The minimum shallow depth that will be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). * * sbitmap_queue_clear() batches wakeups as an optimization. The batch size * depends on the depth of the bitmap. Since the shallow allocation functions * effectively operate with a different depth, the shallow depth must be taken * into account when calculating the batch size. This function must be called * with the minimum shallow depth that will be used. Failure to do so can result * in missed wakeups. */ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth); /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); } static inline void sbq_index_atomic_inc(atomic_t *index) { int old = atomic_read(index); int new = sbq_index_inc(old); atomic_cmpxchg(index, old, new); } /** * sbq_wait_ptr() - Get the next wait queue to use for a &struct * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) { struct sbq_wait_state *ws; ws = &sbq->ws[atomic_read(wait_index)]; sbq_index_atomic_inc(wait_index); return ws; } /** * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct * sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); /** * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct * seq_file. * @sbq: Bitmap queue to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. 
*/ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); struct sbq_wait { struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ struct wait_queue_entry wait; }; #define DEFINE_SBQ_WAIT(name) \ struct sbq_wait name = { \ .sbq = NULL, \ .wait = { \ .private = current, \ .func = autoremove_wake_function, \ .entry = LIST_HEAD_INIT((name).wait.entry), \ } \ } /* * Wrapper around prepare_to_wait_exclusive(), which maintains some extra * internal state. */ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state); /* * Must be paired with sbitmap_prepare_to_wait(). */ void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Wrapper around add_wait_queue(), which maintains some extra internal state */ void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Must be paired with sbitmap_add_wait_queue() */ void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); #endif /* __LINUX_SCALE_BITMAP_H */
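/*
 * Illustrative sketch (not part of the original header): a minimal
 * allocate/free cycle on a &struct sbitmap_queue, roughly the pattern used
 * for tag allocation.  The depth of 128, the shift of -1 (let the core pick
 * a default word size) and the function name are arbitrary example choices.
 */
static int example_sbq_cycle(void)
{
	struct sbitmap_queue sbq;
	unsigned int cpu;
	int tag, ret;

	ret = sbitmap_queue_init_node(&sbq, 128, -1, false, GFP_KERNEL,
				      NUMA_NO_NODE);
	if (ret)
		return ret;

	tag = sbitmap_queue_get(&sbq, &cpu);		/* -1 if no bit is free */
	if (tag >= 0)
		sbitmap_queue_clear(&sbq, tag, cpu);	/* free and wake waiters */

	sbitmap_queue_free(&sbq);
	return 0;
}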
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_MATH64_H #define _LINUX_MATH64_H #include <linux/types.h> #include <vdso/math64.h> #include <asm/div64.h> #if BITS_PER_LONG == 64 #define div64_long(x, y) div64_s64((x), (y)) #define div64_ul(x, y) div64_u64((x), (y)) /** * div_u64_rem - unsigned 64bit divide with 32bit divisor with remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * @remainder: pointer to unsigned 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor * * This is commonly provided by 32bit archs to provide an optimized 64bit * divide. */ static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /* * div_s64_rem - signed 64bit divide with 32bit divisor with remainder * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * @remainder: pointer to signed 32bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /* * div64_u64_rem - unsigned 64bit divide with 64bit divisor and remainder * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * @remainder: pointer to unsigned 64bit remainder * * Return: sets ``*remainder``, then returns dividend / divisor */ static inline u64 div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder) { *remainder = dividend % divisor; return dividend / divisor; } /* * div64_u64 - unsigned 64bit divide with 64bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Return: dividend / divisor */ static inline u64 div64_u64(u64 dividend, u64 divisor) { return dividend / divisor; } /* * div64_s64 - signed 64bit divide with 64bit divisor * @dividend: signed 64bit dividend * @divisor: signed 64bit divisor * * Return: dividend / divisor */ static inline s64 div64_s64(s64 dividend, s64 divisor) { return dividend / divisor; } #elif BITS_PER_LONG == 32 #define div64_long(x, y) div_s64((x), (y)) #define div64_ul(x, y) div_u64((x), (y)) #ifndef div_u64_rem static inline u64 div_u64_rem(u64 dividend, u32 divisor, u32 *remainder) { *remainder = do_div(dividend, divisor); return dividend; } #endif #ifndef div_s64_rem extern s64 div_s64_rem(s64 dividend, s32 divisor, s32 *remainder); #endif #ifndef div64_u64_rem extern u64 
div64_u64_rem(u64 dividend, u64 divisor, u64 *remainder); #endif #ifndef div64_u64 extern u64 div64_u64(u64 dividend, u64 divisor); #endif #ifndef div64_s64 extern s64 div64_s64(s64 dividend, s64 divisor); #endif #endif /* BITS_PER_LONG */ /** * div_u64 - unsigned 64bit divide with 32bit divisor * @dividend: unsigned 64bit dividend * @divisor: unsigned 32bit divisor * * This is the most common 64bit divide and should be used if possible, * as many 32bit archs can optimize this variant better than a full 64bit * divide. */ #ifndef div_u64 static inline u64 div_u64(u64 dividend, u32 divisor) { u32 remainder; return div_u64_rem(dividend, divisor, &remainder); } #endif /** * div_s64 - signed 64bit divide with 32bit divisor * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor */ #ifndef div_s64 static inline s64 div_s64(s64 dividend, s32 divisor) { s32 remainder; return div_s64_rem(dividend, divisor, &remainder); } #endif u32 iter_div_u64_rem(u64 dividend, u32 divisor, u64 *remainder); #ifndef mul_u32_u32 /* * Many a GCC version messes this up and generates a 64x64 mult :-( */ static inline u64 mul_u32_u32(u32 a, u32 b) { return (u64)a * b; } #endif #if defined(CONFIG_ARCH_SUPPORTS_INT128) && defined(__SIZEOF_INT128__) #ifndef mul_u64_u32_shr static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static inline u64 mul_u64_u64_shr(u64 a, u64 mul, unsigned int shift) { return (u64)(((unsigned __int128)a * mul) >> shift); } #endif /* mul_u64_u64_shr */ #else #ifndef mul_u64_u32_shr static inline u64 mul_u64_u32_shr(u64 a, u32 mul, unsigned int shift) { u32 ah, al; u64 ret; al = a; ah = a >> 32; ret = mul_u32_u32(al, mul) >> shift; if (ah) ret += mul_u32_u32(ah, mul) << (32 - shift); return ret; } #endif /* mul_u64_u32_shr */ #ifndef mul_u64_u64_shr static inline u64 mul_u64_u64_shr(u64 a, u64 b, unsigned int shift) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } rl, rm, rn, rh, a0, b0; u64 c; a0.ll = a; b0.ll = b; rl.ll = mul_u32_u32(a0.l.low, b0.l.low); rm.ll = mul_u32_u32(a0.l.low, b0.l.high); rn.ll = mul_u32_u32(a0.l.high, b0.l.low); rh.ll = mul_u32_u32(a0.l.high, b0.l.high); /* * Each of these lines computes a 64-bit intermediate result into "c", * starting at bits 32-95. The low 32-bits go into the result of the * multiplication, the high 32-bits are carried into the next step. */ rl.l.high = c = (u64)rl.l.high + rm.l.low + rn.l.low; rh.l.low = c = (c >> 32) + rm.l.high + rn.l.high + rh.l.low; rh.l.high = (c >> 32) + rh.l.high; /* * The 128-bit result of the multiplication is in rl.ll and rh.ll, * shift it right and throw away the high part of the result. */ if (shift == 0) return rl.ll; if (shift < 64) return (rl.ll >> shift) | (rh.ll << (64 - shift)); return rh.ll >> (shift & 63); } #endif /* mul_u64_u64_shr */ #endif #ifndef mul_u64_u32_div static inline u64 mul_u64_u32_div(u64 a, u32 mul, u32 divisor) { union { u64 ll; struct { #ifdef __BIG_ENDIAN u32 high, low; #else u32 low, high; #endif } l; } u, rl, rh; u.ll = a; rl.ll = mul_u32_u32(u.l.low, mul); rh.ll = mul_u32_u32(u.l.high, mul) + rl.l.high; /* Bits 32-63 of the result will be in rh.l.low. */ rl.l.high = do_div(rh.ll, divisor); /* Bits 0-31 of the result will be in rl.l.low. 
*/ do_div(rl.ll, divisor); rl.l.high = rh.l.low; return rl.ll; } #endif /* mul_u64_u32_div */ u64 mul_u64_u64_div_u64(u64 a, u64 mul, u64 div); #define DIV64_U64_ROUND_UP(ll, d) \ ({ u64 _tmp = (d); div64_u64((ll) + _tmp - 1, _tmp); }) /** * DIV64_U64_ROUND_CLOSEST - unsigned 64bit divide with 64bit divisor rounded to nearest integer * @dividend: unsigned 64bit dividend * @divisor: unsigned 64bit divisor * * Divide unsigned 64bit dividend by unsigned 64bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV64_U64_ROUND_CLOSEST(dividend, divisor) \ ({ u64 _tmp = (divisor); div64_u64((dividend) + _tmp / 2, _tmp); }) /* * DIV_S64_ROUND_CLOSEST - signed 64bit divide with 32bit divisor rounded to nearest integer * @dividend: signed 64bit dividend * @divisor: signed 32bit divisor * * Divide signed 64bit dividend by signed 32bit divisor * and round to closest integer. * * Return: dividend / divisor rounded to nearest integer */ #define DIV_S64_ROUND_CLOSEST(dividend, divisor)( \ { \ s64 __x = (dividend); \ s32 __d = (divisor); \ ((__x > 0) == (__d > 0)) ? \ div_s64((__x + (__d / 2)), __d) : \ div_s64((__x - (__d / 2)), __d); \ } \ ) #endif /* _LINUX_MATH64_H */
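/*
 * Illustrative sketch (not part of the original header): the two patterns
 * these helpers are meant for.  div_u64() replaces an open-coded 64/32
 * division that 32-bit architectures cannot emit directly, and
 * mul_u64_u32_shr() is the usual mult/shift scaling step.  The multiplier
 * and shift below are arbitrary example values, not taken from any real
 * clocksource; NSEC_PER_MSEC comes from linux/time64.h.
 */
static inline u64 example_ns_to_ms(u64 ns)
{
	return div_u64(ns, NSEC_PER_MSEC);	/* 64-bit dividend, 32-bit divisor */
}

static inline u64 example_scale_cycles(u64 cycles)
{
	u32 mult = 2935011;	/* example multiplier */
	u32 shift = 21;		/* example shift */

	return mul_u64_u32_shr(cycles, mult, shift);
}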
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com * Written by Alex Tomas <alex@clusterfs.com> */ #ifndef _EXT4_EXTENTS #define _EXT4_EXTENTS #include "ext4.h" /* * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks * becomes very small, so index split, in-depth growing and * other hard changes happen much more often. * This is for debug purposes only. */ #define AGGRESSIVE_TEST_ /* * With EXTENTS_STATS defined, the number of blocks and extents * are collected in the truncate path. They'll be shown at * umount time. */ #define EXTENTS_STATS__ /* * If CHECK_BINSEARCH is defined, then the results of the binary search * will also be checked by linear search. */ #define CHECK_BINSEARCH__ /* * If EXT_STATS is defined then stats numbers are collected. * These number will be displayed at umount time. */ #define EXT_STATS_ /* * ext4_inode has i_block array (60 bytes total). * The first 12 bytes store ext4_extent_header; * the remainder stores an array of ext4_extent. * For non-inode extent blocks, ext4_extent_tail * follows the array. */ /* * This is the extent tail on-disk structure. * All other extent structures are 12 bytes long. It turns out that * block_size % 12 >= 4 for at least all powers of 2 greater than 512, which * covers all valid ext4 block sizes. Therefore, this tail structure can be * crammed into the end of the block without having to rebalance the tree. */ struct ext4_extent_tail { __le32 et_checksum; /* crc32c(uuid+inum+extent_block) */ }; /* * This is the extent on-disk structure. * It's used at the bottom of the tree. */ struct ext4_extent { __le32 ee_block; /* first logical block extent covers */ __le16 ee_len; /* number of blocks covered by extent */ __le16 ee_start_hi; /* high 16 bits of physical block */ __le32 ee_start_lo; /* low 32 bits of physical block */ }; /* * This is index on-disk structure. * It's used at all the levels except the bottom. */ struct ext4_extent_idx { __le32 ei_block; /* index covers logical blocks from 'block' */ __le32 ei_leaf_lo; /* pointer to the physical block of the next * * level. leaf or next index could be there */ __le16 ei_leaf_hi; /* high 16 bits of physical block */ __u16 ei_unused; }; /* * Each block (leaves and indexes), even inode-stored has header. */ struct ext4_extent_header { __le16 eh_magic; /* probably will support different formats */ __le16 eh_entries; /* number of valid entries */ __le16 eh_max; /* capacity of store in entries */ __le16 eh_depth; /* has tree real underlying blocks? 
*/ __le32 eh_generation; /* generation of the tree */ }; #define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) #define EXT4_MAX_EXTENT_DEPTH 5 #define EXT4_EXTENT_TAIL_OFFSET(hdr) \ (sizeof(struct ext4_extent_header) + \ (sizeof(struct ext4_extent) * le16_to_cpu((hdr)->eh_max))) static inline struct ext4_extent_tail * find_ext4_extent_tail(struct ext4_extent_header *eh) { return (struct ext4_extent_tail *)(((void *)eh) + EXT4_EXTENT_TAIL_OFFSET(eh)); } /* * Array of ext4_ext_path contains path to some extent. * Creation/lookup routines use it for traversal/splitting/etc. * Truncate uses it to simulate recursive walking. */ struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; struct buffer_head *p_bh; }; /* * Used to record a portion of a cluster found at the beginning or end * of an extent while traversing the extent tree during space removal. * A partial cluster may be removed if it does not contain blocks shared * with extents that aren't being deleted (tofree state). Otherwise, * it cannot be removed (nofree state). */ struct partial_cluster { ext4_fsblk_t pclu; /* physical cluster number */ ext4_lblk_t lblk; /* logical block number within logical cluster */ enum {initial, tofree, nofree} state; }; /* * structure for external API */ /* * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an * initialized extent. This is 2^15 and not (2^16 - 1), since we use the * MSB of ee_len field in the extent datastructure to signify if this * particular extent is an initialized extent or an unwritten (i.e. * preallocated). * EXT_UNWRITTEN_MAX_LEN is the maximum number of blocks we can have in an * unwritten extent. * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an * unwritten one. In other words, if MSB of ee_len is set, it is an * unwritten extent with only one special scenario when ee_len = 0x8000. * In this case we can not have an unwritten extent of zero length and * thus we make it as a special case of initialized extent with 0x8000 length. * This way we get better extent-to-group alignment for initialized extents. * Hence, the maximum number of blocks we can have in an *initialized* * extent is 2^15 (32768) and in an *unwritten* extent is 2^15-1 (32767). */ #define EXT_INIT_MAX_LEN (1UL << 15) #define EXT_UNWRITTEN_MAX_LEN (EXT_INIT_MAX_LEN - 1) #define EXT_FIRST_EXTENT(__hdr__) \ ((struct ext4_extent *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_FIRST_INDEX(__hdr__) \ ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ sizeof(struct ext4_extent_header))) #define EXT_HAS_FREE_INDEX(__path__) \ (le16_to_cpu((__path__)->p_hdr->eh_entries) \ < le16_to_cpu((__path__)->p_hdr->eh_max)) #define EXT_LAST_EXTENT(__hdr__) \ (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_MAX_EXTENT(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? \ ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ : 0) #define EXT_MAX_INDEX(__hdr__) \ ((le16_to_cpu((__hdr__)->eh_max)) ? 
\ ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { return (struct ext4_extent_header *) EXT4_I(inode)->i_data; } static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) { return (struct ext4_extent_header *) bh->b_data; } static inline unsigned short ext_depth(struct inode *inode) { return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } static inline void ext4_ext_mark_unwritten(struct ext4_extent *ext) { /* We can not have an unwritten extent of zero length! */ BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); } static inline int ext4_ext_is_unwritten(struct ext4_extent *ext) { /* Extent with ee_len of 0x8000 is treated as an initialized extent */ return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); } static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) { return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? le16_to_cpu(ext->ee_len) : (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); } static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) { ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } /* * ext4_ext_pblock: * combine low and high parts of physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_ext_pblock(struct ext4_extent *ex) { ext4_fsblk_t block; block = le32_to_cpu(ex->ee_start_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ex->ee_start_hi) << 31) << 1; return block; } /* * ext4_idx_pblock: * combine low and high parts of a leaf physical block number into ext4_fsblk_t */ static inline ext4_fsblk_t ext4_idx_pblock(struct ext4_extent_idx *ix) { ext4_fsblk_t block; block = le32_to_cpu(ix->ei_leaf_lo); block |= ((ext4_fsblk_t) le16_to_cpu(ix->ei_leaf_hi) << 31) << 1; return block; } /* * ext4_ext_store_pblock: * stores a large physical block number into an extent struct, * breaking it into parts */ static inline void ext4_ext_store_pblock(struct ext4_extent *ex, ext4_fsblk_t pb) { ex->ee_start_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ex->ee_start_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } /* * ext4_idx_store_pblock: * stores a large physical block number into an index struct, * breaking it into parts */ static inline void ext4_idx_store_pblock(struct ext4_extent_idx *ix, ext4_fsblk_t pb) { ix->ei_leaf_lo = cpu_to_le32((unsigned long) (pb & 0xffffffff)); ix->ei_leaf_hi = cpu_to_le16((unsigned long) ((pb >> 31) >> 1) & 0xffff); } #endif /* _EXT4_EXTENTS */
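/*
 * Illustrative sketch (not part of the original header): decoding one
 * on-disk extent with the accessors above.  The 48-bit physical block
 * number is split across ee_start_lo/ee_start_hi and the MSB of ee_len
 * flags an unwritten extent, so both must be read through the helpers
 * rather than raw.  example_print_extent() is a hypothetical debugging
 * helper.
 */
static void example_print_extent(struct ext4_extent *ex)
{
	ext4_fsblk_t pblk = ext4_ext_pblock(ex);		/* lo | hi << 32 */
	unsigned int len = ext4_ext_get_actual_len(ex);		/* MSB stripped */

	pr_info("extent: lblk %u -> pblk %llu, len %u, %s\n",
		le32_to_cpu(ex->ee_block), (unsigned long long)pblk, len,
		ext4_ext_is_unwritten(ex) ? "unwritten" : "initialized");
}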
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM oom #if !defined(_TRACE_OOM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_OOM_H #include <linux/tracepoint.h> #include <trace/events/mmflags.h> TRACE_EVENT(oom_score_adj_update, TP_PROTO(struct task_struct *task), TP_ARGS(task), TP_STRUCT__entry( __field( pid_t, pid) __array( char, comm, TASK_COMM_LEN ) __field( short, oom_score_adj) ), TP_fast_assign( __entry->pid = task->pid; memcpy(__entry->comm, task->comm, TASK_COMM_LEN); __entry->oom_score_adj = task->signal->oom_score_adj; ), TP_printk("pid=%d comm=%s oom_score_adj=%hd", __entry->pid, __entry->comm, __entry->oom_score_adj) ); TRACE_EVENT(reclaim_retry_zone, TP_PROTO(struct zoneref *zoneref, int order, unsigned long reclaimable, unsigned long available, unsigned long min_wmark, int no_progress_loops, bool wmark_check), TP_ARGS(zoneref, order, reclaimable, available, min_wmark, no_progress_loops, wmark_check), TP_STRUCT__entry( __field( int, node) __field( int, zone_idx) __field( int, order) __field( unsigned long, reclaimable) __field( unsigned long, available) __field( unsigned long, min_wmark) __field( int, no_progress_loops) __field( bool, wmark_check) ), TP_fast_assign( __entry->node = zone_to_nid(zoneref->zone); __entry->zone_idx = zoneref->zone_idx; __entry->order = order; __entry->reclaimable = reclaimable; __entry->available = available; __entry->min_wmark = min_wmark; __entry->no_progress_loops = no_progress_loops; __entry->wmark_check = wmark_check; ), TP_printk("node=%d zone=%-8s order=%d reclaimable=%lu available=%lu min_wmark=%lu no_progress_loops=%d wmark_check=%d", __entry->node, __print_symbolic(__entry->zone_idx, ZONE_TYPE), __entry->order, __entry->reclaimable, __entry->available, __entry->min_wmark, __entry->no_progress_loops, __entry->wmark_check) ); TRACE_EVENT(mark_victim, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(wake_reaper, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(start_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(finish_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); TRACE_EVENT(skip_task_reaping, TP_PROTO(int pid), TP_ARGS(pid), TP_STRUCT__entry( __field(int, pid) ), TP_fast_assign( __entry->pid = pid; ), TP_printk("pid=%d", __entry->pid) ); #ifdef CONFIG_COMPACTION TRACE_EVENT(compact_retry, TP_PROTO(int order, enum compact_priority priority, enum compact_result result, int retries, int 
max_retries, bool ret), TP_ARGS(order, priority, result, retries, max_retries, ret), TP_STRUCT__entry( __field( int, order) __field( int, priority) __field( int, result) __field( int, retries) __field( int, max_retries) __field( bool, ret) ), TP_fast_assign( __entry->order = order; __entry->priority = priority; __entry->result = compact_result_to_feedback(result); __entry->retries = retries; __entry->max_retries = max_retries; __entry->ret = ret; ), TP_printk("order=%d priority=%s compaction_result=%s retries=%d max_retries=%d should_retry=%d", __entry->order, __print_symbolic(__entry->priority, COMPACTION_PRIORITY), __print_symbolic(__entry->result, COMPACTION_FEEDBACK), __entry->retries, __entry->max_retries, __entry->ret) ); #endif /* CONFIG_COMPACTION */ #endif /* This part must be outside protection */ #include <trace/define_trace.h>
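/*
 * Illustrative sketch (not part of the original header): emitting one of
 * the tracepoints defined above.  Exactly one .c file defines
 * CREATE_TRACE_POINTS before including this header to instantiate the
 * events; other callers just include it and invoke the generated
 * trace_*() stubs, which compile to near no-ops while the event is
 * disabled.  example_report_victim() is a hypothetical caller.
 */
static void example_report_victim(struct task_struct *tsk)
{
	trace_mark_victim(tsk->pid);	/* generated by TRACE_EVENT(mark_victim, ...) */
}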
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _INET_COMMON_H #define _INET_COMMON_H #include <linux/indirect_call_wrapper.h> extern const struct proto_ops inet_stream_ops; extern const struct proto_ops inet_dgram_ops; /* * INET4 prototypes used by INET6 */ struct msghdr; struct sock; struct sockaddr; struct socket; int inet_release(struct socket *sock); int inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags, int is_sendmsg); int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags); int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern); int inet_send_prepare(struct sock *sk); int inet_sendmsg(struct socket *sock, struct msghdr *msg, size_t size); ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags); int inet_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, int flags); int inet_shutdown(struct socket *sock, int how); int inet_listen(struct socket *sock, int backlog); void inet_sock_destruct(struct sock *sk); int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len); /* Don't allocate port at this moment, defer to connect. */ #define BIND_FORCE_ADDRESS_NO_PORT (1 << 0) /* Grab and release socket lock. */ #define BIND_WITH_LOCK (1 << 1) /* Called from BPF program. */ #define BIND_FROM_BPF (1 << 2) int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len, u32 flags); int inet_getname(struct socket *sock, struct sockaddr *uaddr, int peer); int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); int inet_ctl_sock_create(struct sock **sk, unsigned short family, unsigned short type, unsigned char protocol, struct net *net); int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len); struct sk_buff *inet_gro_receive(struct list_head *head, struct sk_buff *skb); int inet_gro_complete(struct sk_buff *skb, int nhoff); struct sk_buff *inet_gso_segment(struct sk_buff *skb, netdev_features_t features); static inline void inet_ctl_sock_destroy(struct sock *sk) { if (sk) sock_release(sk->sk_socket); } #define indirect_call_gro_receive(f2, f1, cb, head, skb) \ ({ \ unlikely(gro_recursion_inc_test(skb)) ? \ NAPI_GRO_CB(skb)->flush |= 1, NULL : \ INDIRECT_CALL_2(cb, f2, f1, head, skb); \ }) #endif
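/*
 * Illustrative sketch (not part of the original header): the usual pairing
 * of inet_ctl_sock_create() with inet_ctl_sock_destroy() for a kernel
 * control socket tied to a network namespace.  The UDP protocol choice and
 * the function name are arbitrary example choices.
 */
static int example_ctl_sock_roundtrip(struct net *net)
{
	struct sock *sk;
	int err;

	err = inet_ctl_sock_create(&sk, PF_INET, SOCK_DGRAM, IPPROTO_UDP, net);
	if (err)
		return err;

	/* ... use sk for kernel-internal signalling ... */

	inet_ctl_sock_destroy(sk);
	return 0;
}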
3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 // SPDX-License-Identifier: GPL-2.0-only /* * Copyright (C) 1999 Eric Youngdale * Copyright (C) 2014 Christoph Hellwig * * SCSI queueing library. * Initial versions: Eric Youngdale (eric@andante.org). * Based upon conversations with large numbers * of people at Linux Expo. */ #include <linux/bio.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/completion.h> #include <linux/kernel.h> #include <linux/export.h> #include <linux/init.h> #include <linux/pci.h> #include <linux/delay.h> #include <linux/hardirq.h> #include <linux/scatterlist.h> #include <linux/blk-mq.h> #include <linux/ratelimit.h> #include <asm/unaligned.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_dbg.h> #include <scsi/scsi_device.h> #include <scsi/scsi_driver.h> #include <scsi/scsi_eh.h> #include <scsi/scsi_host.h> #include <scsi/scsi_transport.h> /* __scsi_init_queue() */ #include <scsi/scsi_dh.h> #include <trace/events/scsi.h> #include "scsi_debugfs.h" #include "scsi_priv.h" #include "scsi_logging.h" /* * Size of integrity metadata is usually small, 1 inline sg should * cover normal cases. */ #ifdef CONFIG_ARCH_NO_SG_CHAIN #define SCSI_INLINE_PROT_SG_CNT 0 #define SCSI_INLINE_SG_CNT 0 #else #define SCSI_INLINE_PROT_SG_CNT 1 #define SCSI_INLINE_SG_CNT 2 #endif static struct kmem_cache *scsi_sense_cache; static struct kmem_cache *scsi_sense_isadma_cache; static DEFINE_MUTEX(scsi_sense_cache_mutex); static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd); static inline struct kmem_cache * scsi_select_sense_cache(bool unchecked_isa_dma) { return unchecked_isa_dma ? scsi_sense_isadma_cache : scsi_sense_cache; } static void scsi_free_sense_buffer(bool unchecked_isa_dma, unsigned char *sense_buffer) { kmem_cache_free(scsi_select_sense_cache(unchecked_isa_dma), sense_buffer); } static unsigned char *scsi_alloc_sense_buffer(bool unchecked_isa_dma, gfp_t gfp_mask, int numa_node) { return kmem_cache_alloc_node(scsi_select_sense_cache(unchecked_isa_dma), gfp_mask, numa_node); } int scsi_init_sense_cache(struct Scsi_Host *shost) { struct kmem_cache *cache; int ret = 0; mutex_lock(&scsi_sense_cache_mutex); cache = scsi_select_sense_cache(shost->unchecked_isa_dma); if (cache) goto exit; if (shost->unchecked_isa_dma) { scsi_sense_isadma_cache = kmem_cache_create("scsi_sense_cache(DMA)", SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA, NULL); if (!scsi_sense_isadma_cache) ret = -ENOMEM; } else { scsi_sense_cache = kmem_cache_create_usercopy("scsi_sense_cache", SCSI_SENSE_BUFFERSIZE, 0, SLAB_HWCACHE_ALIGN, 0, SCSI_SENSE_BUFFERSIZE, NULL); if (!scsi_sense_cache) ret = -ENOMEM; } exit: mutex_unlock(&scsi_sense_cache_mutex); return ret; } /* * When to reinvoke queueing after a resource shortage. 
It's 3 msecs to * not change behaviour from the previous unplug mechanism, experimentation * may prove this needs changing. */ #define SCSI_QUEUE_DELAY 3 static void scsi_set_blocked(struct scsi_cmnd *cmd, int reason) { struct Scsi_Host *host = cmd->device->host; struct scsi_device *device = cmd->device; struct scsi_target *starget = scsi_target(device); /* * Set the appropriate busy bit for the device/host. * * If the host/device isn't busy, assume that something actually * completed, and that we should be able to queue a command now. * * Note that the prior mid-layer assumption that any host could * always queue at least one command is now broken. The mid-layer * will implement a user specifiable stall (see * scsi_host.max_host_blocked and scsi_device.max_device_blocked) * if a command is requeued with no other commands outstanding * either for the device or for the host. */ switch (reason) { case SCSI_MLQUEUE_HOST_BUSY: atomic_set(&host->host_blocked, host->max_host_blocked); break; case SCSI_MLQUEUE_DEVICE_BUSY: case SCSI_MLQUEUE_EH_RETRY: atomic_set(&device->device_blocked, device->max_device_blocked); break; case SCSI_MLQUEUE_TARGET_BUSY: atomic_set(&starget->target_blocked, starget->max_target_blocked); break; } } static void scsi_mq_requeue_cmd(struct scsi_cmnd *cmd) { if (cmd->request->rq_flags & RQF_DONTPREP) { cmd->request->rq_flags &= ~RQF_DONTPREP; scsi_mq_uninit_cmd(cmd); } else { WARN_ON_ONCE(true); } blk_mq_requeue_request(cmd->request, true); } /** * __scsi_queue_insert - private queue insertion * @cmd: The SCSI command being requeued * @reason: The reason for the requeue * @unbusy: Whether the queue should be unbusied * * This is a private queue insertion. The public interface * scsi_queue_insert() always assumes the queue should be unbusied * because it's always called before the completion. This function is * for a requeue after completion, which should only occur in this * file. */ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, bool unbusy) { struct scsi_device *device = cmd->device; SCSI_LOG_MLQUEUE(1, scmd_printk(KERN_INFO, cmd, "Inserting command %p into mlqueue\n", cmd)); scsi_set_blocked(cmd, reason); /* * Decrement the counters, since these commands are no longer * active on the host/device. */ if (unbusy) scsi_device_unbusy(device, cmd); /* * Requeue this command. It will go before all other commands * that are already in the queue. Schedule requeue work under * lock such that the kblockd_schedule_work() call happens * before blk_cleanup_queue() finishes. */ cmd->result = 0; blk_mq_requeue_request(cmd->request, true); } /** * scsi_queue_insert - Reinsert a command in the queue. * @cmd: command that we are adding to queue. * @reason: why we are inserting command to queue. * * We do this for one of two cases. Either the host is busy and it cannot accept * any more commands for the time being, or the device returned QUEUE_FULL and * can accept no more commands. * * Context: This could be called either from an interrupt context or a normal * process context. 
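 *
 * For instance, the completion path in scsi_softirq_done() below requeues
 * a command whose disposition is NEEDS_RETRY with
 * scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY), and an ADD_TO_MLQUEUE
 * disposition with scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY).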
*/ void scsi_queue_insert(struct scsi_cmnd *cmd, int reason) { __scsi_queue_insert(cmd, reason, true); } /** * __scsi_execute - insert request and wait for the result * @sdev: scsi device * @cmd: scsi command * @data_direction: data direction * @buffer: data buffer * @bufflen: len of buffer * @sense: optional sense buffer * @sshdr: optional decoded sense header * @timeout: request timeout in seconds * @retries: number of times to retry request * @flags: flags for ->cmd_flags * @rq_flags: flags for ->rq_flags * @resid: optional residual length * * Returns the scsi_cmnd result field if a command was executed, or a negative * Linux error code if we didn't get that far. */ int __scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, int data_direction, void *buffer, unsigned bufflen, unsigned char *sense, struct scsi_sense_hdr *sshdr, int timeout, int retries, u64 flags, req_flags_t rq_flags, int *resid) { struct request *req; struct scsi_request *rq; int ret = DRIVER_ERROR << 24; req = blk_get_request(sdev->request_queue, data_direction == DMA_TO_DEVICE ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, rq_flags & RQF_PM ? BLK_MQ_REQ_PM : 0); if (IS_ERR(req)) return ret; rq = scsi_req(req); if (bufflen && blk_rq_map_kern(sdev->request_queue, req, buffer, bufflen, GFP_NOIO)) goto out; rq->cmd_len = COMMAND_SIZE(cmd[0]); memcpy(rq->cmd, cmd, rq->cmd_len); rq->retries = retries; req->timeout = timeout; req->cmd_flags |= flags; req->rq_flags |= rq_flags | RQF_QUIET; /* * head injection *required* here otherwise quiesce won't work */ blk_execute_rq(req->q, NULL, req, 1); /* * Some devices (USB mass-storage in particular) may transfer * garbage data together with a residue indicating that the data * is invalid. Prevent the garbage from being misinterpreted * and prevent security leaks by zeroing out the excess data. */ if (unlikely(rq->resid_len > 0 && rq->resid_len <= bufflen)) memset(buffer + (bufflen - rq->resid_len), 0, rq->resid_len); if (resid) *resid = rq->resid_len; if (sense && rq->sense_len) memcpy(sense, rq->sense, SCSI_SENSE_BUFFERSIZE); if (sshdr) scsi_normalize_sense(rq->sense, rq->sense_len, sshdr); ret = rq->result; out: blk_put_request(req); return ret; } EXPORT_SYMBOL(__scsi_execute); /* * Wake up the error handler if necessary. Avoid as follows that the error * handler is not woken up if host in-flight requests number == * shost->host_failed: use call_rcu() in scsi_eh_scmd_add() in combination * with an RCU read lock in this function to ensure that this function in * its entirety either finishes before scsi_eh_scmd_add() increases the * host_failed counter or that it notices the shost state change made by * scsi_eh_scmd_add(). */ static void scsi_dec_host_busy(struct Scsi_Host *shost, struct scsi_cmnd *cmd) { unsigned long flags; rcu_read_lock(); __clear_bit(SCMD_STATE_INFLIGHT, &cmd->state); if (unlikely(scsi_host_in_recovery(shost))) { spin_lock_irqsave(shost->host_lock, flags); if (shost->host_failed || shost->host_eh_scheduled) scsi_eh_wakeup(shost); spin_unlock_irqrestore(shost->host_lock, flags); } rcu_read_unlock(); } void scsi_device_unbusy(struct scsi_device *sdev, struct scsi_cmnd *cmd) { struct Scsi_Host *shost = sdev->host; struct scsi_target *starget = scsi_target(sdev); scsi_dec_host_busy(shost, cmd); if (starget->can_queue > 0) atomic_dec(&starget->target_busy); atomic_dec(&sdev->device_busy); } static void scsi_kick_queue(struct request_queue *q) { blk_mq_run_hw_queues(q, false); } /* * Called for single_lun devices on IO completion. 
Clear starget_sdev_user, * and call blk_run_queue for all the scsi_devices on the target - * including current_sdev first. * * Called with *no* scsi locks held. */ static void scsi_single_lun_run(struct scsi_device *current_sdev) { struct Scsi_Host *shost = current_sdev->host; struct scsi_device *sdev, *tmp; struct scsi_target *starget = scsi_target(current_sdev); unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); starget->starget_sdev_user = NULL; spin_unlock_irqrestore(shost->host_lock, flags); /* * Call blk_run_queue for all LUNs on the target, starting with * current_sdev. We race with others (to set starget_sdev_user), * but in most cases, we will be first. Ideally, each LU on the * target would get some limited time or requests on the target. */ scsi_kick_queue(current_sdev->request_queue); spin_lock_irqsave(shost->host_lock, flags); if (starget->starget_sdev_user) goto out; list_for_each_entry_safe(sdev, tmp, &starget->devices, same_target_siblings) { if (sdev == current_sdev) continue; if (scsi_device_get(sdev)) continue; spin_unlock_irqrestore(shost->host_lock, flags); scsi_kick_queue(sdev->request_queue); spin_lock_irqsave(shost->host_lock, flags); scsi_device_put(sdev); } out: spin_unlock_irqrestore(shost->host_lock, flags); } static inline bool scsi_device_is_busy(struct scsi_device *sdev) { if (atomic_read(&sdev->device_busy) >= sdev->queue_depth) return true; if (atomic_read(&sdev->device_blocked) > 0) return true; return false; } static inline bool scsi_target_is_busy(struct scsi_target *starget) { if (starget->can_queue > 0) { if (atomic_read(&starget->target_busy) >= starget->can_queue) return true; if (atomic_read(&starget->target_blocked) > 0) return true; } return false; } static inline bool scsi_host_is_busy(struct Scsi_Host *shost) { if (atomic_read(&shost->host_blocked) > 0) return true; if (shost->host_self_blocked) return true; return false; } static void scsi_starved_list_run(struct Scsi_Host *shost) { LIST_HEAD(starved_list); struct scsi_device *sdev; unsigned long flags; spin_lock_irqsave(shost->host_lock, flags); list_splice_init(&shost->starved_list, &starved_list); while (!list_empty(&starved_list)) { struct request_queue *slq; /* * As long as shost is accepting commands and we have * starved queues, call blk_run_queue. scsi_request_fn * drops the queue_lock and can add us back to the * starved_list. * * host_lock protects the starved_list and starved_entry. * scsi_request_fn must get the host_lock before checking * or modifying starved_list or starved_entry. */ if (scsi_host_is_busy(shost)) break; sdev = list_entry(starved_list.next, struct scsi_device, starved_entry); list_del_init(&sdev->starved_entry); if (scsi_target_is_busy(scsi_target(sdev))) { list_move_tail(&sdev->starved_entry, &shost->starved_list); continue; } /* * Once we drop the host lock, a racing scsi_remove_device() * call may remove the sdev from the starved list and destroy * it and the queue. Mitigate by taking a reference to the * queue and never touching the sdev again after we drop the * host lock. Note: if __scsi_remove_device() invokes * blk_cleanup_queue() before the queue is run from this * function then blk_run_queue() will return immediately since * blk_cleanup_queue() marks the queue with QUEUE_FLAG_DYING. 
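 *
 * Hence the pattern used below: take a reference with blk_get_queue(),
 * drop the host lock, kick the queue, drop the reference with
 * blk_put_queue(), and only then retake the host lock.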
*/ slq = sdev->request_queue; if (!blk_get_queue(slq)) continue; spin_unlock_irqrestore(shost->host_lock, flags); scsi_kick_queue(slq); blk_put_queue(slq); spin_lock_irqsave(shost->host_lock, flags); } /* put any unprocessed entries back */ list_splice(&starved_list, &shost->starved_list); spin_unlock_irqrestore(shost->host_lock, flags); } /** * scsi_run_queue - Select a proper request queue to serve next. * @q: last request's queue * * The previous command was completely finished, start a new one if possible. */ static void scsi_run_queue(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; if (scsi_target(sdev)->single_lun) scsi_single_lun_run(sdev); if (!list_empty(&sdev->host->starved_list)) scsi_starved_list_run(sdev->host); blk_mq_run_hw_queues(q, false); } void scsi_requeue_run_queue(struct work_struct *work) { struct scsi_device *sdev; struct request_queue *q; sdev = container_of(work, struct scsi_device, requeue_work); q = sdev->request_queue; scsi_run_queue(q); } void scsi_run_host_queues(struct Scsi_Host *shost) { struct scsi_device *sdev; shost_for_each_device(sdev, shost) scsi_run_queue(sdev->request_queue); } static void scsi_uninit_cmd(struct scsi_cmnd *cmd) { if (!blk_rq_is_passthrough(cmd->request)) { struct scsi_driver *drv = scsi_cmd_to_driver(cmd); if (drv->uninit_command) drv->uninit_command(cmd); } } void scsi_free_sgtables(struct scsi_cmnd *cmd) { if (cmd->sdb.table.nents) sg_free_table_chained(&cmd->sdb.table, SCSI_INLINE_SG_CNT); if (scsi_prot_sg_count(cmd)) sg_free_table_chained(&cmd->prot_sdb->table, SCSI_INLINE_PROT_SG_CNT); } EXPORT_SYMBOL_GPL(scsi_free_sgtables); static void scsi_mq_uninit_cmd(struct scsi_cmnd *cmd) { scsi_free_sgtables(cmd); scsi_uninit_cmd(cmd); } static void scsi_run_queue_async(struct scsi_device *sdev) { if (scsi_target(sdev)->single_lun || !list_empty(&sdev->host->starved_list)) { kblockd_schedule_work(&sdev->requeue_work); } else { /* * smp_mb() present in sbitmap_queue_clear() or implied in * .end_io is for ordering writing .device_busy in * scsi_device_unbusy() and reading sdev->restarts. */ int old = atomic_read(&sdev->restarts); /* * ->restarts has to be kept as non-zero if new budget * contention occurs. * * No need to run queue when either another re-run * queue wins in updating ->restarts or a new budget * contention occurs. */ if (old && atomic_cmpxchg(&sdev->restarts, old, 0) == old) blk_mq_run_hw_queues(sdev->request_queue, true); } } /* Returns false when no more bytes to process, true if there are more */ static bool scsi_end_request(struct request *req, blk_status_t error, unsigned int bytes) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = cmd->device; struct request_queue *q = sdev->request_queue; if (blk_update_request(req, error, bytes)) return true; if (blk_queue_add_random(q)) add_disk_randomness(req->rq_disk); if (!blk_rq_is_scsi(req)) { WARN_ON_ONCE(!(cmd->flags & SCMD_INITIALIZED)); cmd->flags &= ~SCMD_INITIALIZED; } /* * Calling rcu_barrier() is not necessary here because the * SCSI error handler guarantees that the function called by * call_rcu() has been called before scsi_end_request() is * called. */ destroy_rcu_head(&cmd->rcu); /* * In the MQ case the command gets freed by __blk_mq_end_request, * so we have to do all cleanup that depends on it earlier. * * We also can't kick the queues from irq context, so we * will have to defer it to a workqueue. 
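 *
 * The resulting order below is: uninit the command, pin the queue via
 * q_usage_counter, complete the request with __blk_mq_end_request(), and
 * let scsi_run_queue_async() either run the hw queues directly or defer
 * the work to the requeue work item.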
*/ scsi_mq_uninit_cmd(cmd); /* * queue is still alive, so grab the ref for preventing it * from being cleaned up during running queue. */ percpu_ref_get(&q->q_usage_counter); __blk_mq_end_request(req, error); scsi_run_queue_async(sdev); percpu_ref_put(&q->q_usage_counter); return false; } /** * scsi_result_to_blk_status - translate a SCSI result code into blk_status_t * @cmd: SCSI command * @result: scsi error code * * Translate a SCSI result code into a blk_status_t value. May reset the host * byte of @cmd->result. */ static blk_status_t scsi_result_to_blk_status(struct scsi_cmnd *cmd, int result) { switch (host_byte(result)) { case DID_OK: /* * Also check the other bytes than the status byte in result * to handle the case when a SCSI LLD sets result to * DRIVER_SENSE << 24 without setting SAM_STAT_CHECK_CONDITION. */ if (scsi_status_is_good(result) && (result & ~0xff) == 0) return BLK_STS_OK; return BLK_STS_IOERR; case DID_TRANSPORT_FAILFAST: return BLK_STS_TRANSPORT; case DID_TARGET_FAILURE: set_host_byte(cmd, DID_OK); return BLK_STS_TARGET; case DID_NEXUS_FAILURE: set_host_byte(cmd, DID_OK); return BLK_STS_NEXUS; case DID_ALLOC_FAILURE: set_host_byte(cmd, DID_OK); return BLK_STS_NOSPC; case DID_MEDIUM_ERROR: set_host_byte(cmd, DID_OK); return BLK_STS_MEDIUM; default: return BLK_STS_IOERR; } } /* Helper for scsi_io_completion() when "reprep" action required. */ static void scsi_io_completion_reprep(struct scsi_cmnd *cmd, struct request_queue *q) { /* A new command will be prepared and issued. */ scsi_mq_requeue_cmd(cmd); } static bool scsi_cmd_runtime_exceeced(struct scsi_cmnd *cmd) { struct request *req = cmd->request; unsigned long wait_for; if (cmd->allowed == SCSI_CMD_RETRIES_NO_LIMIT) return false; wait_for = (cmd->allowed + 1) * req->timeout; if (time_before(cmd->jiffies_at_alloc + wait_for, jiffies)) { scmd_printk(KERN_ERR, cmd, "timing out command, waited %lus\n", wait_for/HZ); return true; } return false; } /* Helper for scsi_io_completion() when special action required. */ static void scsi_io_completion_action(struct scsi_cmnd *cmd, int result) { struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; int level = 0; enum {ACTION_FAIL, ACTION_REPREP, ACTION_RETRY, ACTION_DELAYED_RETRY} action; struct scsi_sense_hdr sshdr; bool sense_valid; bool sense_current = true; /* false implies "deferred sense" */ blk_status_t blk_stat; sense_valid = scsi_command_normalize_sense(cmd, &sshdr); if (sense_valid) sense_current = !scsi_sense_is_deferred(&sshdr); blk_stat = scsi_result_to_blk_status(cmd, result); if (host_byte(result) == DID_RESET) { /* Third party bus reset or reset for error recovery * reasons. Just retry the command and see what * happens. */ action = ACTION_RETRY; } else if (sense_valid && sense_current) { switch (sshdr.sense_key) { case UNIT_ATTENTION: if (cmd->device->removable) { /* Detected disc change. Set a bit * and quietly refuse further access. */ cmd->device->changed = 1; action = ACTION_FAIL; } else { /* Must have been a power glitch, or a * bus reset. Could not have been a * media change, so we just retry the * command and see what happens. */ action = ACTION_RETRY; } break; case ILLEGAL_REQUEST: /* If we had an ILLEGAL REQUEST returned, then * we may have performed an unsupported * command. The only thing this should be * would be a ten byte read where only a six * byte read was supported. Also, on a system * where READ CAPACITY failed, we may have * read past the end of the disk. 
*/ if ((cmd->device->use_10_for_rw && sshdr.asc == 0x20 && sshdr.ascq == 0x00) && (cmd->cmnd[0] == READ_10 || cmd->cmnd[0] == WRITE_10)) { /* This will issue a new 6-byte command. */ cmd->device->use_10_for_rw = 0; action = ACTION_REPREP; } else if (sshdr.asc == 0x10) /* DIX */ { action = ACTION_FAIL; blk_stat = BLK_STS_PROTECTION; /* INVALID COMMAND OPCODE or INVALID FIELD IN CDB */ } else if (sshdr.asc == 0x20 || sshdr.asc == 0x24) { action = ACTION_FAIL; blk_stat = BLK_STS_TARGET; } else action = ACTION_FAIL; break; case ABORTED_COMMAND: action = ACTION_FAIL; if (sshdr.asc == 0x10) /* DIF */ blk_stat = BLK_STS_PROTECTION; break; case NOT_READY: /* If the device is in the process of becoming * ready, or has a temporary blockage, retry. */ if (sshdr.asc == 0x04) { switch (sshdr.ascq) { case 0x01: /* becoming ready */ case 0x04: /* format in progress */ case 0x05: /* rebuild in progress */ case 0x06: /* recalculation in progress */ case 0x07: /* operation in progress */ case 0x08: /* Long write in progress */ case 0x09: /* self test in progress */ case 0x11: /* notify (enable spinup) required */ case 0x14: /* space allocation in progress */ case 0x1a: /* start stop unit in progress */ case 0x1b: /* sanitize in progress */ case 0x1d: /* configuration in progress */ case 0x24: /* depopulation in progress */ action = ACTION_DELAYED_RETRY; break; default: action = ACTION_FAIL; break; } } else action = ACTION_FAIL; break; case VOLUME_OVERFLOW: /* See SSC3rXX or current. */ action = ACTION_FAIL; break; case DATA_PROTECT: action = ACTION_FAIL; if ((sshdr.asc == 0x0C && sshdr.ascq == 0x12) || (sshdr.asc == 0x55 && (sshdr.ascq == 0x0E || sshdr.ascq == 0x0F))) { /* Insufficient zone resources */ blk_stat = BLK_STS_ZONE_OPEN_RESOURCE; } break; default: action = ACTION_FAIL; break; } } else action = ACTION_FAIL; if (action != ACTION_FAIL && scsi_cmd_runtime_exceeced(cmd)) action = ACTION_FAIL; switch (action) { case ACTION_FAIL: /* Give up and fail the remainder of the request */ if (!(req->rq_flags & RQF_QUIET)) { static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); if (unlikely(scsi_logging_level)) level = SCSI_LOG_LEVEL(SCSI_LOG_MLCOMPLETE_SHIFT, SCSI_LOG_MLCOMPLETE_BITS); /* * if logging is enabled the failure will be printed * in scsi_log_completion(), so avoid duplicate messages */ if (!level && __ratelimit(&_rs)) { scsi_print_result(cmd, NULL, FAILED); if (driver_byte(result) == DRIVER_SENSE) scsi_print_sense(cmd); scsi_print_command(cmd); } } if (!scsi_end_request(req, blk_stat, blk_rq_err_bytes(req))) return; fallthrough; case ACTION_REPREP: scsi_io_completion_reprep(cmd, q); break; case ACTION_RETRY: /* Retry the same command immediately */ __scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY, false); break; case ACTION_DELAYED_RETRY: /* Retry the same command after a delay */ __scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY, false); break; } } /* * Helper for scsi_io_completion() when cmd->result is non-zero. Returns a * new result that may suppress further error checking. Also modifies * *blk_statp in some cases. 
*/ static int scsi_io_completion_nz_result(struct scsi_cmnd *cmd, int result, blk_status_t *blk_statp) { bool sense_valid; bool sense_current = true; /* false implies "deferred sense" */ struct request *req = cmd->request; struct scsi_sense_hdr sshdr; sense_valid = scsi_command_normalize_sense(cmd, &sshdr); if (sense_valid) sense_current = !scsi_sense_is_deferred(&sshdr); if (blk_rq_is_passthrough(req)) { if (sense_valid) { /* * SG_IO wants current and deferred errors */ scsi_req(req)->sense_len = min(8 + cmd->sense_buffer[7], SCSI_SENSE_BUFFERSIZE); } if (sense_current) *blk_statp = scsi_result_to_blk_status(cmd, result); } else if (blk_rq_bytes(req) == 0 && sense_current) { /* * Flush commands do not transfers any data, and thus cannot use * good_bytes != blk_rq_bytes(req) as the signal for an error. * This sets *blk_statp explicitly for the problem case. */ *blk_statp = scsi_result_to_blk_status(cmd, result); } /* * Recovered errors need reporting, but they're always treated as * success, so fiddle the result code here. For passthrough requests * we already took a copy of the original into sreq->result which * is what gets returned to the user */ if (sense_valid && (sshdr.sense_key == RECOVERED_ERROR)) { bool do_print = true; /* * if ATA PASS-THROUGH INFORMATION AVAILABLE [0x0, 0x1d] * skip print since caller wants ATA registers. Only occurs * on SCSI ATA PASS_THROUGH commands when CK_COND=1 */ if ((sshdr.asc == 0x0) && (sshdr.ascq == 0x1d)) do_print = false; else if (req->rq_flags & RQF_QUIET) do_print = false; if (do_print) scsi_print_sense(cmd); result = 0; /* for passthrough, *blk_statp may be set */ *blk_statp = BLK_STS_OK; } /* * Another corner case: the SCSI status byte is non-zero but 'good'. * Example: PRE-FETCH command returns SAM_STAT_CONDITION_MET when * it is able to fit nominated LBs in its cache (and SAM_STAT_GOOD * if it can't fit). Treat SAM_STAT_CONDITION_MET and the related * intermediate statuses (both obsolete in SAM-4) as good. */ if (status_byte(result) && scsi_status_is_good(result)) { result = 0; *blk_statp = BLK_STS_OK; } return result; } /** * scsi_io_completion - Completion processing for SCSI commands. * @cmd: command that is finished. * @good_bytes: number of processed bytes. * * We will finish off the specified number of sectors. If we are done, the * command block will be released and the queue function will be goosed. If we * are not done then we have to figure out what to do next: * * a) We can call scsi_io_completion_reprep(). The request will be * unprepared and put back on the queue. Then a new command will * be created for it. This should be used if we made forward * progress, or if we want to switch from READ(10) to READ(6) for * example. * * b) We can call scsi_io_completion_action(). The request will be * put back on the queue and retried using the same command as * before, possibly after a delay. * * c) We can call scsi_end_request() with blk_stat other than * BLK_STS_OK, to fail the remainder of the request. 
*/ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) { int result = cmd->result; struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; blk_status_t blk_stat = BLK_STS_OK; if (unlikely(result)) /* a nz result may or may not be an error */ result = scsi_io_completion_nz_result(cmd, result, &blk_stat); if (unlikely(blk_rq_is_passthrough(req))) { /* * scsi_result_to_blk_status may have reset the host_byte */ scsi_req(req)->result = cmd->result; } /* * Next deal with any sectors which we were able to correctly * handle. */ SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, cmd, "%u sectors total, %d bytes done.\n", blk_rq_sectors(req), good_bytes)); /* * Failed, zero length commands always need to drop down * to retry code. Fast path should return in this block. */ if (likely(blk_rq_bytes(req) > 0 || blk_stat == BLK_STS_OK)) { if (likely(!scsi_end_request(req, blk_stat, good_bytes))) return; /* no bytes remaining */ } /* Kill remainder if no retries. */ if (unlikely(blk_stat && scsi_noretry_cmd(cmd))) { if (scsi_end_request(req, blk_stat, blk_rq_bytes(req))) WARN_ONCE(true, "Bytes remaining after failed, no-retry command"); return; } /* * If there had been no error, but we have leftover bytes in the * requeues just queue the command up again. */ if (likely(result == 0)) scsi_io_completion_reprep(cmd, q); else scsi_io_completion_action(cmd, result); } static inline bool scsi_cmd_needs_dma_drain(struct scsi_device *sdev, struct request *rq) { return sdev->dma_drain_len && blk_rq_is_passthrough(rq) && !op_is_write(req_op(rq)) && sdev->host->hostt->dma_need_drain(rq); } /** * scsi_alloc_sgtables - allocate S/G tables for a command * @cmd: command descriptor we wish to initialize * * Returns: * * BLK_STS_OK - on success * * BLK_STS_RESOURCE - if the failure is retryable * * BLK_STS_IOERR - if the failure is fatal */ blk_status_t scsi_alloc_sgtables(struct scsi_cmnd *cmd) { struct scsi_device *sdev = cmd->device; struct request *rq = cmd->request; unsigned short nr_segs = blk_rq_nr_phys_segments(rq); struct scatterlist *last_sg = NULL; blk_status_t ret; bool need_drain = scsi_cmd_needs_dma_drain(sdev, rq); int count; if (WARN_ON_ONCE(!nr_segs)) return BLK_STS_IOERR; /* * Make sure there is space for the drain. The driver must adjust * max_hw_segments to be prepared for this. */ if (need_drain) nr_segs++; /* * If sg table allocation fails, requeue request later. */ if (unlikely(sg_alloc_table_chained(&cmd->sdb.table, nr_segs, cmd->sdb.table.sgl, SCSI_INLINE_SG_CNT))) return BLK_STS_RESOURCE; /* * Next, walk the list, and fill in the addresses and sizes of * each segment. */ count = __blk_rq_map_sg(rq->q, rq, cmd->sdb.table.sgl, &last_sg); if (blk_rq_bytes(rq) & rq->q->dma_pad_mask) { unsigned int pad_len = (rq->q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; last_sg->length += pad_len; cmd->extra_len += pad_len; } if (need_drain) { sg_unmark_end(last_sg); last_sg = sg_next(last_sg); sg_set_buf(last_sg, sdev->dma_drain_buf, sdev->dma_drain_len); sg_mark_end(last_sg); cmd->extra_len += sdev->dma_drain_len; count++; } BUG_ON(count > cmd->sdb.table.nents); cmd->sdb.table.nents = count; cmd->sdb.length = blk_rq_payload_bytes(rq); if (blk_integrity_rq(rq)) { struct scsi_data_buffer *prot_sdb = cmd->prot_sdb; int ivecs; if (WARN_ON_ONCE(!prot_sdb)) { /* * This can happen if someone (e.g. multipath) * queues a command to a device on an adapter * that does not support DIX. 
*/ ret = BLK_STS_IOERR; goto out_free_sgtables; } ivecs = blk_rq_count_integrity_sg(rq->q, rq->bio); if (sg_alloc_table_chained(&prot_sdb->table, ivecs, prot_sdb->table.sgl, SCSI_INLINE_PROT_SG_CNT)) { ret = BLK_STS_RESOURCE; goto out_free_sgtables; } count = blk_rq_map_integrity_sg(rq->q, rq->bio, prot_sdb->table.sgl); BUG_ON(count > ivecs); BUG_ON(count > queue_max_integrity_segments(rq->q)); cmd->prot_sdb = prot_sdb; cmd->prot_sdb->table.nents = count; } return BLK_STS_OK; out_free_sgtables: scsi_free_sgtables(cmd); return ret; } EXPORT_SYMBOL(scsi_alloc_sgtables); /** * scsi_initialize_rq - initialize struct scsi_cmnd partially * @rq: Request associated with the SCSI command to be initialized. * * This function initializes the members of struct scsi_cmnd that must be * initialized before request processing starts and that won't be * reinitialized if a SCSI command is requeued. * * Called from inside blk_get_request() for pass-through requests and from * inside scsi_init_command() for filesystem requests. */ static void scsi_initialize_rq(struct request *rq) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); scsi_req_init(&cmd->req); init_rcu_head(&cmd->rcu); cmd->jiffies_at_alloc = jiffies; cmd->retries = 0; } /* * Only called when the request isn't completed by SCSI, and not freed by * SCSI */ static void scsi_cleanup_rq(struct request *rq) { if (rq->rq_flags & RQF_DONTPREP) { scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq)); rq->rq_flags &= ~RQF_DONTPREP; } } /* Called before a request is prepared. See also scsi_mq_prep_fn(). */ void scsi_init_command(struct scsi_device *dev, struct scsi_cmnd *cmd) { void *buf = cmd->sense_buffer; void *prot = cmd->prot_sdb; struct request *rq = blk_mq_rq_from_pdu(cmd); unsigned int flags = cmd->flags & SCMD_PRESERVED_FLAGS; unsigned long jiffies_at_alloc; int retries, to_clear; bool in_flight; if (!blk_rq_is_scsi(rq) && !(flags & SCMD_INITIALIZED)) { flags |= SCMD_INITIALIZED; scsi_initialize_rq(rq); } jiffies_at_alloc = cmd->jiffies_at_alloc; retries = cmd->retries; in_flight = test_bit(SCMD_STATE_INFLIGHT, &cmd->state); /* * Zero out the cmd, except for the embedded scsi_request. Only clear * the driver-private command data if the LLD does not supply a * function to initialize that data. */ to_clear = sizeof(*cmd) - sizeof(cmd->req); if (!dev->host->hostt->init_cmd_priv) to_clear += dev->host->hostt->cmd_size; memset((char *)cmd + sizeof(cmd->req), 0, to_clear); cmd->device = dev; cmd->sense_buffer = buf; cmd->prot_sdb = prot; cmd->flags = flags; INIT_DELAYED_WORK(&cmd->abort_work, scmd_eh_abort_handler); cmd->jiffies_at_alloc = jiffies_at_alloc; cmd->retries = retries; if (in_flight) __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); } static blk_status_t scsi_setup_scsi_cmnd(struct scsi_device *sdev, struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); /* * Passthrough requests may transfer data, in which case they must * a bio attached to them. Or they might contain a SCSI command * that does not transfer data, in which case they may optionally * submit a request without an attached bio. 
*/ if (req->bio) { blk_status_t ret = scsi_alloc_sgtables(cmd); if (unlikely(ret != BLK_STS_OK)) return ret; } else { BUG_ON(blk_rq_bytes(req)); memset(&cmd->sdb, 0, sizeof(cmd->sdb)); } cmd->cmd_len = scsi_req(req)->cmd_len; cmd->cmnd = scsi_req(req)->cmd; cmd->transfersize = blk_rq_bytes(req); cmd->allowed = scsi_req(req)->retries; return BLK_STS_OK; } static blk_status_t scsi_device_state_check(struct scsi_device *sdev, struct request *req) { switch (sdev->sdev_state) { case SDEV_CREATED: return BLK_STS_OK; case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: /* * If the device is offline we refuse to process any * commands. The device must be brought online * before trying any recovery commands. */ if (!sdev->offline_already) { sdev->offline_already = true; sdev_printk(KERN_ERR, sdev, "rejecting I/O to offline device\n"); } return BLK_STS_IOERR; case SDEV_DEL: /* * If the device is fully deleted, we refuse to * process any commands as well. */ sdev_printk(KERN_ERR, sdev, "rejecting I/O to dead device\n"); return BLK_STS_IOERR; case SDEV_BLOCK: case SDEV_CREATED_BLOCK: return BLK_STS_RESOURCE; case SDEV_QUIESCE: /* * If the device is blocked we only accept power management * commands. */ if (req && WARN_ON_ONCE(!(req->rq_flags & RQF_PM))) return BLK_STS_RESOURCE; return BLK_STS_OK; default: /* * For any other not fully online state we only allow * power management commands. */ if (req && !(req->rq_flags & RQF_PM)) return BLK_STS_IOERR; return BLK_STS_OK; } } /* * scsi_dev_queue_ready: if we can send requests to sdev, return 1 else * return 0. * * Called with the queue_lock held. */ static inline int scsi_dev_queue_ready(struct request_queue *q, struct scsi_device *sdev) { unsigned int busy; busy = atomic_inc_return(&sdev->device_busy) - 1; if (atomic_read(&sdev->device_blocked)) { if (busy) goto out_dec; /* * unblock after device_blocked iterates to zero */ if (atomic_dec_return(&sdev->device_blocked) > 0) goto out_dec; SCSI_LOG_MLQUEUE(3, sdev_printk(KERN_INFO, sdev, "unblocking device at zero depth\n")); } if (busy >= sdev->queue_depth) goto out_dec; return 1; out_dec: atomic_dec(&sdev->device_busy); return 0; } /* * scsi_target_queue_ready: checks if there we can send commands to target * @sdev: scsi device on starget to check. */ static inline int scsi_target_queue_ready(struct Scsi_Host *shost, struct scsi_device *sdev) { struct scsi_target *starget = scsi_target(sdev); unsigned int busy; if (starget->single_lun) { spin_lock_irq(shost->host_lock); if (starget->starget_sdev_user && starget->starget_sdev_user != sdev) { spin_unlock_irq(shost->host_lock); return 0; } starget->starget_sdev_user = sdev; spin_unlock_irq(shost->host_lock); } if (starget->can_queue <= 0) return 1; busy = atomic_inc_return(&starget->target_busy) - 1; if (atomic_read(&starget->target_blocked) > 0) { if (busy) goto starved; /* * unblock after target_blocked iterates to zero */ if (atomic_dec_return(&starget->target_blocked) > 0) goto out_dec; SCSI_LOG_MLQUEUE(3, starget_printk(KERN_INFO, starget, "unblocking target at zero depth\n")); } if (busy >= starget->can_queue) goto starved; return 1; starved: spin_lock_irq(shost->host_lock); list_move_tail(&sdev->starved_entry, &shost->starved_list); spin_unlock_irq(shost->host_lock); out_dec: if (starget->can_queue > 0) atomic_dec(&starget->target_busy); return 0; } /* * scsi_host_queue_ready: if we can send requests to shost, return 1 else * return 0. We must end up running the queue again whenever 0 is * returned, else IO can hang. 
*/ static inline int scsi_host_queue_ready(struct request_queue *q, struct Scsi_Host *shost, struct scsi_device *sdev, struct scsi_cmnd *cmd) { if (scsi_host_in_recovery(shost)) return 0; if (atomic_read(&shost->host_blocked) > 0) { if (scsi_host_busy(shost) > 0) goto starved; /* * unblock after host_blocked iterates to zero */ if (atomic_dec_return(&shost->host_blocked) > 0) goto out_dec; SCSI_LOG_MLQUEUE(3, shost_printk(KERN_INFO, shost, "unblocking host at zero depth\n")); } if (shost->host_self_blocked) goto starved; /* We're OK to process the command, so we can't be starved */ if (!list_empty(&sdev->starved_entry)) { spin_lock_irq(shost->host_lock); if (!list_empty(&sdev->starved_entry)) list_del_init(&sdev->starved_entry); spin_unlock_irq(shost->host_lock); } __set_bit(SCMD_STATE_INFLIGHT, &cmd->state); return 1; starved: spin_lock_irq(shost->host_lock); if (list_empty(&sdev->starved_entry)) list_add_tail(&sdev->starved_entry, &shost->starved_list); spin_unlock_irq(shost->host_lock); out_dec: scsi_dec_host_busy(shost, cmd); return 0; } /* * Busy state exporting function for request stacking drivers. * * For efficiency, no lock is taken to check the busy state of * shost/starget/sdev, since the returned value is not guaranteed and * may be changed after request stacking drivers call the function, * regardless of taking lock or not. * * When scsi can't dispatch I/Os anymore and needs to kill I/Os scsi * needs to return 'not busy'. Otherwise, request stacking drivers * may hold requests forever. */ static bool scsi_mq_lld_busy(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost; if (blk_queue_dying(q)) return false; shost = sdev->host; /* * Ignore host/starget busy state. * Since block layer does not have a concept of fairness across * multiple queues, congestion of host/starget needs to be handled * in SCSI layer. */ if (scsi_host_in_recovery(shost) || scsi_device_is_busy(sdev)) return true; return false; } static void scsi_softirq_done(struct request *rq) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); int disposition; INIT_LIST_HEAD(&cmd->eh_entry); atomic_inc(&cmd->device->iodone_cnt); if (cmd->result) atomic_inc(&cmd->device->ioerr_cnt); disposition = scsi_decide_disposition(cmd); if (disposition != SUCCESS && scsi_cmd_runtime_exceeced(cmd)) disposition = SUCCESS; scsi_log_completion(cmd, disposition); switch (disposition) { case SUCCESS: scsi_finish_command(cmd); break; case NEEDS_RETRY: scsi_queue_insert(cmd, SCSI_MLQUEUE_EH_RETRY); break; case ADD_TO_MLQUEUE: scsi_queue_insert(cmd, SCSI_MLQUEUE_DEVICE_BUSY); break; default: scsi_eh_scmd_add(cmd); break; } } /** * scsi_dispatch_command - Dispatch a command to the low-level driver. * @cmd: command block we are dispatching. * * Return: nonzero return request was rejected and device's queue needs to be * plugged. */ static int scsi_dispatch_cmd(struct scsi_cmnd *cmd) { struct Scsi_Host *host = cmd->device->host; int rtn = 0; atomic_inc(&cmd->device->iorequest_cnt); /* check if the device is still usable */ if (unlikely(cmd->device->sdev_state == SDEV_DEL)) { /* in SDEV_DEL we error all commands. DID_NO_CONNECT * returns an immediate error upwards, and signals * that the device is no longer present */ cmd->result = DID_NO_CONNECT << 16; goto done; } /* Check to see if the scsi lld made this device blocked. */ if (unlikely(scsi_device_blocked(cmd->device))) { /* * in blocked state, the command is just put back on * the device queue. 
The suspend state has already * blocked the queue so future requests should not * occur until the device transitions out of the * suspend state. */ SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd, "queuecommand : device blocked\n")); return SCSI_MLQUEUE_DEVICE_BUSY; } /* Store the LUN value in cmnd, if needed. */ if (cmd->device->lun_in_cdb) cmd->cmnd[1] = (cmd->cmnd[1] & 0x1f) | (cmd->device->lun << 5 & 0xe0); scsi_log_send(cmd); /* * Before we queue this command, check if the command * length exceeds what the host adapter can handle. */ if (cmd->cmd_len > cmd->device->host->max_cmd_len) { SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd, "queuecommand : command too long. " "cdb_size=%d host->max_cmd_len=%d\n", cmd->cmd_len, cmd->device->host->max_cmd_len)); cmd->result = (DID_ABORT << 16); goto done; } if (unlikely(host->shost_state == SHOST_DEL)) { cmd->result = (DID_NO_CONNECT << 16); goto done; } trace_scsi_dispatch_cmd_start(cmd); rtn = host->hostt->queuecommand(host, cmd); if (rtn) { trace_scsi_dispatch_cmd_error(cmd, rtn); if (rtn != SCSI_MLQUEUE_DEVICE_BUSY && rtn != SCSI_MLQUEUE_TARGET_BUSY) rtn = SCSI_MLQUEUE_HOST_BUSY; SCSI_LOG_MLQUEUE(3, scmd_printk(KERN_INFO, cmd, "queuecommand : request rejected\n")); } return rtn; done: cmd->scsi_done(cmd); return 0; } /* Size in bytes of the sg-list stored in the scsi-mq command-private data. */ static unsigned int scsi_mq_inline_sgl_size(struct Scsi_Host *shost) { return min_t(unsigned int, shost->sg_tablesize, SCSI_INLINE_SG_CNT) * sizeof(struct scatterlist); } static blk_status_t scsi_prepare_cmd(struct request *req) { struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); struct scsi_device *sdev = req->q->queuedata; struct Scsi_Host *shost = sdev->host; struct scatterlist *sg; scsi_init_command(sdev, cmd); cmd->request = req; cmd->tag = req->tag; cmd->prot_op = SCSI_PROT_NORMAL; if (blk_rq_bytes(req)) cmd->sc_data_direction = rq_dma_dir(req); else cmd->sc_data_direction = DMA_NONE; sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; cmd->sdb.table.sgl = sg; if (scsi_host_get_prot(shost)) { memset(cmd->prot_sdb, 0, sizeof(struct scsi_data_buffer)); cmd->prot_sdb->table.sgl = (struct scatterlist *)(cmd->prot_sdb + 1); } /* * Special handling for passthrough commands, which don't go to the ULP * at all: */ if (blk_rq_is_scsi(req)) return scsi_setup_scsi_cmnd(sdev, req); if (sdev->handler && sdev->handler->prep_fn) { blk_status_t ret = sdev->handler->prep_fn(sdev, req); if (ret != BLK_STS_OK) return ret; } cmd->cmnd = scsi_req(req)->cmd = scsi_req(req)->__cmd; memset(cmd->cmnd, 0, BLK_MAX_CDB); return scsi_cmd_to_driver(cmd)->init_command(cmd); } static void scsi_mq_done(struct scsi_cmnd *cmd) { if (unlikely(blk_should_fake_timeout(cmd->request->q))) return; if (unlikely(test_and_set_bit(SCMD_STATE_COMPLETE, &cmd->state))) return; trace_scsi_dispatch_cmd_done(cmd); blk_mq_complete_request(cmd->request); } static void scsi_mq_put_budget(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; atomic_dec(&sdev->device_busy); } static bool scsi_mq_get_budget(struct request_queue *q) { struct scsi_device *sdev = q->queuedata; if (scsi_dev_queue_ready(q, sdev)) return true; atomic_inc(&sdev->restarts); /* * Orders atomic_inc(&sdev->restarts) and atomic_read(&sdev->device_busy). * .restarts must be incremented before .device_busy is read because the * code in scsi_run_queue_async() depends on the order of these operations. 
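 *
 * This barrier is intended to pair with the smp_mb() implied on the
 * completion side (see the comment in scsi_run_queue_async()), so that
 * either this CPU observes device_busy == 0 below or the completing CPU
 * observes the incremented ->restarts value.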
*/ smp_mb__after_atomic(); /* * If all in-flight requests originated from this LUN are completed * before reading .device_busy, sdev->device_busy will be observed as * zero, then blk_mq_delay_run_hw_queues() will dispatch this request * soon. Otherwise, completion of one of these requests will observe * the .restarts flag, and the request queue will be run for handling * this request, see scsi_end_request(). */ if (unlikely(atomic_read(&sdev->device_busy) == 0 && !scsi_device_blocked(sdev))) blk_mq_delay_run_hw_queues(sdev->request_queue, SCSI_QUEUE_DELAY); return false; } static blk_status_t scsi_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *req = bd->rq; struct request_queue *q = req->q; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(req); blk_status_t ret; int reason; /* * If the device is not in running state we will reject some or all * commands. */ if (unlikely(sdev->sdev_state != SDEV_RUNNING)) { ret = scsi_device_state_check(sdev, req); if (ret != BLK_STS_OK) goto out_put_budget; } ret = BLK_STS_RESOURCE; if (!scsi_target_queue_ready(shost, sdev)) goto out_put_budget; if (!scsi_host_queue_ready(q, shost, sdev, cmd)) goto out_dec_target_busy; if (!(req->rq_flags & RQF_DONTPREP)) { ret = scsi_prepare_cmd(req); if (ret != BLK_STS_OK) goto out_dec_host_busy; req->rq_flags |= RQF_DONTPREP; } else { clear_bit(SCMD_STATE_COMPLETE, &cmd->state); } cmd->flags &= SCMD_PRESERVED_FLAGS; if (sdev->simple_tags) cmd->flags |= SCMD_TAGGED; if (bd->last) cmd->flags |= SCMD_LAST; scsi_set_resid(cmd, 0); memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); cmd->scsi_done = scsi_mq_done; blk_mq_start_request(req); reason = scsi_dispatch_cmd(cmd); if (reason) { scsi_set_blocked(cmd, reason); ret = BLK_STS_RESOURCE; goto out_dec_host_busy; } return BLK_STS_OK; out_dec_host_busy: scsi_dec_host_busy(shost, cmd); out_dec_target_busy: if (scsi_target(sdev)->can_queue > 0) atomic_dec(&scsi_target(sdev)->target_busy); out_put_budget: scsi_mq_put_budget(q); switch (ret) { case BLK_STS_OK: break; case BLK_STS_RESOURCE: case BLK_STS_ZONE_RESOURCE: if (scsi_device_blocked(sdev)) ret = BLK_STS_DEV_RESOURCE; break; default: if (unlikely(!scsi_device_online(sdev))) scsi_req(req)->result = DID_NO_CONNECT << 16; else scsi_req(req)->result = DID_ERROR << 16; /* * Make sure to release all allocated resources when * we hit an error, as we will never see this command * again. 
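 *
 * That means undoing the preparation done earlier in this function: if
 * RQF_DONTPREP was set, scsi_mq_uninit_cmd() below releases the scatter
 * tables before the error is returned.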
*/ if (req->rq_flags & RQF_DONTPREP) scsi_mq_uninit_cmd(cmd); scsi_run_queue_async(sdev); break; } return ret; } static enum blk_eh_timer_return scsi_timeout(struct request *req, bool reserved) { if (reserved) return BLK_EH_RESET_TIMER; return scsi_times_out(req); } static int scsi_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx, unsigned int numa_node) { struct Scsi_Host *shost = set->driver_data; const bool unchecked_isa_dma = shost->unchecked_isa_dma; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); struct scatterlist *sg; int ret = 0; if (unchecked_isa_dma) cmd->flags |= SCMD_UNCHECKED_ISA_DMA; cmd->sense_buffer = scsi_alloc_sense_buffer(unchecked_isa_dma, GFP_KERNEL, numa_node); if (!cmd->sense_buffer) return -ENOMEM; cmd->req.sense = cmd->sense_buffer; if (scsi_host_get_prot(shost)) { sg = (void *)cmd + sizeof(struct scsi_cmnd) + shost->hostt->cmd_size; cmd->prot_sdb = (void *)sg + scsi_mq_inline_sgl_size(shost); } if (shost->hostt->init_cmd_priv) { ret = shost->hostt->init_cmd_priv(shost, cmd); if (ret < 0) scsi_free_sense_buffer(unchecked_isa_dma, cmd->sense_buffer); } return ret; } static void scsi_mq_exit_request(struct blk_mq_tag_set *set, struct request *rq, unsigned int hctx_idx) { struct Scsi_Host *shost = set->driver_data; struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); if (shost->hostt->exit_cmd_priv) shost->hostt->exit_cmd_priv(shost, cmd); scsi_free_sense_buffer(cmd->flags & SCMD_UNCHECKED_ISA_DMA, cmd->sense_buffer); } static int scsi_map_queues(struct blk_mq_tag_set *set) { struct Scsi_Host *shost = container_of(set, struct Scsi_Host, tag_set); if (shost->hostt->map_queues) return shost->hostt->map_queues(shost); return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } void __scsi_init_queue(struct Scsi_Host *shost, struct request_queue *q) { struct device *dev = shost->dma_dev; /* * this limit is imposed by hardware restrictions */ blk_queue_max_segments(q, min_t(unsigned short, shost->sg_tablesize, SG_MAX_SEGMENTS)); if (scsi_host_prot_dma(shost)) { shost->sg_prot_tablesize = min_not_zero(shost->sg_prot_tablesize, (unsigned short)SCSI_MAX_PROT_SG_SEGMENTS); BUG_ON(shost->sg_prot_tablesize < shost->sg_tablesize); blk_queue_max_integrity_segments(q, shost->sg_prot_tablesize); } if (dev->dma_mask) { shost->max_sectors = min_t(unsigned int, shost->max_sectors, dma_max_mapping_size(dev) >> SECTOR_SHIFT); } blk_queue_max_hw_sectors(q, shost->max_sectors); if (shost->unchecked_isa_dma) blk_queue_bounce_limit(q, BLK_BOUNCE_ISA); blk_queue_segment_boundary(q, shost->dma_boundary); dma_set_seg_boundary(dev, shost->dma_boundary); blk_queue_max_segment_size(q, shost->max_segment_size); blk_queue_virt_boundary(q, shost->virt_boundary_mask); dma_set_max_seg_size(dev, queue_max_segment_size(q)); /* * Set a reasonable default alignment: The larger of 32-byte (dword), * which is a common minimum for HBAs, and the minimum DMA alignment, * which is set by the platform. * * Devices that require a bigger alignment can increase it later. 
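 *
 * The mask passed below is max(4, dma_get_cache_alignment()) - 1: for
 * example 63 on a platform with 64-byte cache lines, and 3 (4-byte
 * alignment) where the platform reports nothing larger.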
*/ blk_queue_dma_alignment(q, max(4, dma_get_cache_alignment()) - 1); } EXPORT_SYMBOL_GPL(__scsi_init_queue); static const struct blk_mq_ops scsi_mq_ops_no_commit = { .get_budget = scsi_mq_get_budget, .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, .complete = scsi_softirq_done, .timeout = scsi_timeout, #ifdef CONFIG_BLK_DEBUG_FS .show_rq = scsi_show_rq, #endif .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .initialize_rq_fn = scsi_initialize_rq, .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, }; static void scsi_commit_rqs(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct scsi_device *sdev = q->queuedata; struct Scsi_Host *shost = sdev->host; shost->hostt->commit_rqs(shost, hctx->queue_num); } static const struct blk_mq_ops scsi_mq_ops = { .get_budget = scsi_mq_get_budget, .put_budget = scsi_mq_put_budget, .queue_rq = scsi_queue_rq, .commit_rqs = scsi_commit_rqs, .complete = scsi_softirq_done, .timeout = scsi_timeout, #ifdef CONFIG_BLK_DEBUG_FS .show_rq = scsi_show_rq, #endif .init_request = scsi_mq_init_request, .exit_request = scsi_mq_exit_request, .initialize_rq_fn = scsi_initialize_rq, .cleanup_rq = scsi_cleanup_rq, .busy = scsi_mq_lld_busy, .map_queues = scsi_map_queues, }; struct request_queue *scsi_mq_alloc_queue(struct scsi_device *sdev) { sdev->request_queue = blk_mq_init_queue(&sdev->host->tag_set); if (IS_ERR(sdev->request_queue)) return NULL; sdev->request_queue->queuedata = sdev; __scsi_init_queue(sdev->host, sdev->request_queue); blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, sdev->request_queue); return sdev->request_queue; } int scsi_mq_setup_tags(struct Scsi_Host *shost) { unsigned int cmd_size, sgl_size; struct blk_mq_tag_set *tag_set = &shost->tag_set; sgl_size = max_t(unsigned int, sizeof(struct scatterlist), scsi_mq_inline_sgl_size(shost)); cmd_size = sizeof(struct scsi_cmnd) + shost->hostt->cmd_size + sgl_size; if (scsi_host_get_prot(shost)) cmd_size += sizeof(struct scsi_data_buffer) + sizeof(struct scatterlist) * SCSI_INLINE_PROT_SG_CNT; memset(tag_set, 0, sizeof(*tag_set)); if (shost->hostt->commit_rqs) tag_set->ops = &scsi_mq_ops; else tag_set->ops = &scsi_mq_ops_no_commit; tag_set->nr_hw_queues = shost->nr_hw_queues ? : 1; tag_set->queue_depth = shost->can_queue; tag_set->cmd_size = cmd_size; tag_set->numa_node = NUMA_NO_NODE; tag_set->flags = BLK_MQ_F_SHOULD_MERGE; tag_set->flags |= BLK_ALLOC_POLICY_TO_MQ_FLAG(shost->hostt->tag_alloc_policy); tag_set->driver_data = shost; if (shost->host_tagset) tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; return blk_mq_alloc_tag_set(tag_set); } void scsi_mq_destroy_tags(struct Scsi_Host *shost) { blk_mq_free_tag_set(&shost->tag_set); } /** * scsi_device_from_queue - return sdev associated with a request_queue * @q: The request queue to return the sdev from * * Return the sdev associated with a request queue or NULL if the * request_queue does not reference a SCSI device. */ struct scsi_device *scsi_device_from_queue(struct request_queue *q) { struct scsi_device *sdev = NULL; if (q->mq_ops == &scsi_mq_ops_no_commit || q->mq_ops == &scsi_mq_ops) sdev = q->queuedata; if (!sdev || !get_device(&sdev->sdev_gendev)) sdev = NULL; return sdev; } /** * scsi_block_requests - Utility function used by low-level drivers to prevent * further commands from being queued to the device. 
* @shost: host in question * * There is no timer nor any other means by which the requests get unblocked * other than the low-level driver calling scsi_unblock_requests(). */ void scsi_block_requests(struct Scsi_Host *shost) { shost->host_self_blocked = 1; } EXPORT_SYMBOL(scsi_block_requests); /** * scsi_unblock_requests - Utility function used by low-level drivers to allow * further commands to be queued to the device. * @shost: host in question * * There is no timer nor any other means by which the requests get unblocked * other than the low-level driver calling scsi_unblock_requests(). This is done * as an API function so that changes to the internals of the scsi mid-layer * won't require wholesale changes to drivers that use this feature. */ void scsi_unblock_requests(struct Scsi_Host *shost) { shost->host_self_blocked = 0; scsi_run_host_queues(shost); } EXPORT_SYMBOL(scsi_unblock_requests); void scsi_exit_queue(void) { kmem_cache_destroy(scsi_sense_cache); kmem_cache_destroy(scsi_sense_isadma_cache); } /** * scsi_mode_select - issue a mode select * @sdev: SCSI device to be queried * @pf: Page format bit (1 == standard, 0 == vendor specific) * @sp: Save page bit (0 == don't save, 1 == save) * @modepage: mode page being requested * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. * @timeout: command timeout * @retries: number of retries before failing * @data: returns a structure abstracting the mode header data * @sshdr: place to put sense data (or NULL if no sense to be collected). * must be SCSI_SENSE_BUFFERSIZE big. * * Returns zero if successful; negative error number or scsi * status on error * */ int scsi_mode_select(struct scsi_device *sdev, int pf, int sp, int modepage, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { unsigned char cmd[10]; unsigned char *real_buffer; int ret; memset(cmd, 0, sizeof(cmd)); cmd[1] = (pf ? 0x10 : 0) | (sp ? 0x01 : 0); if (sdev->use_10_for_ms) { if (len > 65535) return -EINVAL; real_buffer = kmalloc(8 + len, GFP_KERNEL); if (!real_buffer) return -ENOMEM; memcpy(real_buffer + 8, buffer, len); len += 8; real_buffer[0] = 0; real_buffer[1] = 0; real_buffer[2] = data->medium_type; real_buffer[3] = data->device_specific; real_buffer[4] = data->longlba ? 0x01 : 0; real_buffer[5] = 0; real_buffer[6] = data->block_descriptor_length >> 8; real_buffer[7] = data->block_descriptor_length; cmd[0] = MODE_SELECT_10; cmd[7] = len >> 8; cmd[8] = len; } else { if (len > 255 || data->block_descriptor_length > 255 || data->longlba) return -EINVAL; real_buffer = kmalloc(4 + len, GFP_KERNEL); if (!real_buffer) return -ENOMEM; memcpy(real_buffer + 4, buffer, len); len += 4; real_buffer[0] = 0; real_buffer[1] = data->medium_type; real_buffer[2] = data->device_specific; real_buffer[3] = data->block_descriptor_length; cmd[0] = MODE_SELECT; cmd[4] = len; } ret = scsi_execute_req(sdev, cmd, DMA_TO_DEVICE, real_buffer, len, sshdr, timeout, retries, NULL); kfree(real_buffer); return ret; } EXPORT_SYMBOL_GPL(scsi_mode_select); /** * scsi_mode_sense - issue a mode sense, falling back from 10 to six bytes if necessary. * @sdev: SCSI device to be queried * @dbd: set if mode sense will allow block descriptors to be returned * @modepage: mode page being requested * @buffer: request buffer (may not be smaller than eight bytes) * @len: length of request buffer. 
* @timeout: command timeout * @retries: number of retries before failing * @data: returns a structure abstracting the mode header data * @sshdr: place to put sense data (or NULL if no sense to be collected). * must be SCSI_SENSE_BUFFERSIZE big. * * Returns zero if successful, or a negative error number on failure */ int scsi_mode_sense(struct scsi_device *sdev, int dbd, int modepage, unsigned char *buffer, int len, int timeout, int retries, struct scsi_mode_data *data, struct scsi_sense_hdr *sshdr) { unsigned char cmd[12]; int use_10_for_ms; int header_length; int result, retry_count = retries; struct scsi_sense_hdr my_sshdr; memset(data, 0, sizeof(*data)); memset(&cmd[0], 0, 12); dbd = sdev->set_dbd_for_ms ? 8 : dbd; cmd[1] = dbd & 0x18; /* allows DBD and LLBA bits */ cmd[2] = modepage; /* caller might not be interested in sense, but we need it */ if (!sshdr) sshdr = &my_sshdr; retry: use_10_for_ms = sdev->use_10_for_ms; if (use_10_for_ms) { if (len < 8) len = 8; cmd[0] = MODE_SENSE_10; cmd[8] = len; header_length = 8; } else { if (len < 4) len = 4; cmd[0] = MODE_SENSE; cmd[4] = len; header_length = 4; } memset(buffer, 0, len); result = scsi_execute_req(sdev, cmd, DMA_FROM_DEVICE, buffer, len, sshdr, timeout, retries, NULL); if (result < 0) return result; /* This code looks awful: what it's doing is making sure an * ILLEGAL REQUEST sense return identifies the actual command * byte as the problem. MODE_SENSE commands can return * ILLEGAL REQUEST if the code page isn't supported */ if (use_10_for_ms && !scsi_status_is_good(result) && driver_byte(result) == DRIVER_SENSE) { if (scsi_sense_valid(sshdr)) { if ((sshdr->sense_key == ILLEGAL_REQUEST) && (sshdr->asc == 0x20) && (sshdr->ascq == 0)) { /* * Invalid command operation code */ sdev->use_10_for_ms = 0; goto retry; } } } if (scsi_status_is_good(result)) { if (unlikely(buffer[0] == 0x86 && buffer[1] == 0x0b && (modepage == 6 || modepage == 8))) { /* Initio breakage? */ header_length = 0; data->length = 13; data->medium_type = 0; data->device_specific = 0; data->longlba = 0; data->block_descriptor_length = 0; } else if (use_10_for_ms) { data->length = buffer[0]*256 + buffer[1] + 2; data->medium_type = buffer[2]; data->device_specific = buffer[3]; data->longlba = buffer[4] & 0x01; data->block_descriptor_length = buffer[6]*256 + buffer[7]; } else { data->length = buffer[0] + 1; data->medium_type = buffer[1]; data->device_specific = buffer[2]; data->block_descriptor_length = buffer[3]; } data->header_length = header_length; result = 0; } else if ((status_byte(result) == CHECK_CONDITION) && scsi_sense_valid(sshdr) && sshdr->sense_key == UNIT_ATTENTION && retry_count) { retry_count--; goto retry; } if (result > 0) result = -EIO; return result; } EXPORT_SYMBOL(scsi_mode_sense); /** * scsi_test_unit_ready - test if unit is ready * @sdev: scsi device to change the state of. * @timeout: command timeout * @retries: number of retries before failing * @sshdr: outpout pointer for decoded sense information. * * Returns zero if unsuccessful or an error if TUR failed. For * removable media, UNIT_ATTENTION sets ->changed flag. 
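 *
 * Each attempt below issues TEST UNIT READY through scsi_execute_req()
 * with its retry count set to 1; the loop repeats while UNIT ATTENTION
 * sense keeps being reported, consuming up to @retries attempts.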
**/ int scsi_test_unit_ready(struct scsi_device *sdev, int timeout, int retries, struct scsi_sense_hdr *sshdr) { char cmd[] = { TEST_UNIT_READY, 0, 0, 0, 0, 0, }; int result; /* try to eat the UNIT_ATTENTION if there are enough retries */ do { result = scsi_execute_req(sdev, cmd, DMA_NONE, NULL, 0, sshdr, timeout, 1, NULL); if (sdev->removable && scsi_sense_valid(sshdr) && sshdr->sense_key == UNIT_ATTENTION) sdev->changed = 1; } while (scsi_sense_valid(sshdr) && sshdr->sense_key == UNIT_ATTENTION && --retries); return result; } EXPORT_SYMBOL(scsi_test_unit_ready); /** * scsi_device_set_state - Take the given device through the device state model. * @sdev: scsi device to change the state of. * @state: state to change to. * * Returns zero if successful or an error if the requested * transition is illegal. */ int scsi_device_set_state(struct scsi_device *sdev, enum scsi_device_state state) { enum scsi_device_state oldstate = sdev->sdev_state; if (state == oldstate) return 0; switch (state) { case SDEV_CREATED: switch (oldstate) { case SDEV_CREATED_BLOCK: break; default: goto illegal; } break; case SDEV_RUNNING: switch (oldstate) { case SDEV_CREATED: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: case SDEV_QUIESCE: case SDEV_BLOCK: break; default: goto illegal; } break; case SDEV_QUIESCE: switch (oldstate) { case SDEV_RUNNING: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: break; default: goto illegal; } break; case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: switch (oldstate) { case SDEV_CREATED: case SDEV_RUNNING: case SDEV_QUIESCE: case SDEV_BLOCK: break; default: goto illegal; } break; case SDEV_BLOCK: switch (oldstate) { case SDEV_RUNNING: case SDEV_CREATED_BLOCK: case SDEV_QUIESCE: case SDEV_OFFLINE: break; default: goto illegal; } break; case SDEV_CREATED_BLOCK: switch (oldstate) { case SDEV_CREATED: break; default: goto illegal; } break; case SDEV_CANCEL: switch (oldstate) { case SDEV_CREATED: case SDEV_RUNNING: case SDEV_QUIESCE: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: break; default: goto illegal; } break; case SDEV_DEL: switch (oldstate) { case SDEV_CREATED: case SDEV_RUNNING: case SDEV_OFFLINE: case SDEV_TRANSPORT_OFFLINE: case SDEV_CANCEL: case SDEV_BLOCK: case SDEV_CREATED_BLOCK: break; default: goto illegal; } break; } sdev->offline_already = false; sdev->sdev_state = state; return 0; illegal: SCSI_LOG_ERROR_RECOVERY(1, sdev_printk(KERN_ERR, sdev, "Illegal state transition %s->%s", scsi_device_state_name(oldstate), scsi_device_state_name(state)) ); return -EINVAL; } EXPORT_SYMBOL(scsi_device_set_state); /** * sdev_evt_emit - emit a single SCSI device uevent * @sdev: associated SCSI device * @evt: event to emit * * Send a single uevent (scsi_event) to the associated scsi_device. 
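 *
 * The event type is translated into an SDEV_* environment string, e.g.
 * SDEV_EVT_MEDIA_CHANGE becomes "SDEV_MEDIA_CHANGE=1", which is passed to
 * kobject_uevent_env() as a KOBJ_CHANGE event.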
*/ static void scsi_evt_emit(struct scsi_device *sdev, struct scsi_event *evt) { int idx = 0; char *envp[3]; switch (evt->evt_type) { case SDEV_EVT_MEDIA_CHANGE: envp[idx++] = "SDEV_MEDIA_CHANGE=1"; break; case SDEV_EVT_INQUIRY_CHANGE_REPORTED: scsi_rescan_device(&sdev->sdev_gendev); envp[idx++] = "SDEV_UA=INQUIRY_DATA_HAS_CHANGED"; break; case SDEV_EVT_CAPACITY_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=CAPACITY_DATA_HAS_CHANGED"; break; case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED: envp[idx++] = "SDEV_UA=THIN_PROVISIONING_SOFT_THRESHOLD_REACHED"; break; case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=MODE_PARAMETERS_CHANGED"; break; case SDEV_EVT_LUN_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=REPORTED_LUNS_DATA_HAS_CHANGED"; break; case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED: envp[idx++] = "SDEV_UA=ASYMMETRIC_ACCESS_STATE_CHANGED"; break; case SDEV_EVT_POWER_ON_RESET_OCCURRED: envp[idx++] = "SDEV_UA=POWER_ON_RESET_OCCURRED"; break; default: /* do nothing */ break; } envp[idx++] = NULL; kobject_uevent_env(&sdev->sdev_gendev.kobj, KOBJ_CHANGE, envp); } /** * sdev_evt_thread - send a uevent for each scsi event * @work: work struct for scsi_device * * Dispatch queued events to their associated scsi_device kobjects * as uevents. */ void scsi_evt_thread(struct work_struct *work) { struct scsi_device *sdev; enum scsi_device_event evt_type; LIST_HEAD(event_list); sdev = container_of(work, struct scsi_device, event_work); for (evt_type = SDEV_EVT_FIRST; evt_type <= SDEV_EVT_LAST; evt_type++) if (test_and_clear_bit(evt_type, sdev->pending_events)) sdev_evt_send_simple(sdev, evt_type, GFP_KERNEL); while (1) { struct scsi_event *evt; struct list_head *this, *tmp; unsigned long flags; spin_lock_irqsave(&sdev->list_lock, flags); list_splice_init(&sdev->event_list, &event_list); spin_unlock_irqrestore(&sdev->list_lock, flags); if (list_empty(&event_list)) break; list_for_each_safe(this, tmp, &event_list) { evt = list_entry(this, struct scsi_event, node); list_del(&evt->node); scsi_evt_emit(sdev, evt); kfree(evt); } } } /** * sdev_evt_send - send asserted event to uevent thread * @sdev: scsi_device event occurred on * @evt: event to send * * Assert scsi device event asynchronously. */ void sdev_evt_send(struct scsi_device *sdev, struct scsi_event *evt) { unsigned long flags; #if 0 /* FIXME: currently this check eliminates all media change events * for polled devices. Need to update to discriminate between AN * and polled events */ if (!test_bit(evt->evt_type, sdev->supported_events)) { kfree(evt); return; } #endif spin_lock_irqsave(&sdev->list_lock, flags); list_add_tail(&evt->node, &sdev->event_list); schedule_work(&sdev->event_work); spin_unlock_irqrestore(&sdev->list_lock, flags); } EXPORT_SYMBOL_GPL(sdev_evt_send); /** * sdev_evt_alloc - allocate a new scsi event * @evt_type: type of event to allocate * @gfpflags: GFP flags for allocation * * Allocates and returns a new scsi_event. 
*/ struct scsi_event *sdev_evt_alloc(enum scsi_device_event evt_type, gfp_t gfpflags) { struct scsi_event *evt = kzalloc(sizeof(struct scsi_event), gfpflags); if (!evt) return NULL; evt->evt_type = evt_type; INIT_LIST_HEAD(&evt->node); /* evt_type-specific initialization, if any */ switch (evt_type) { case SDEV_EVT_MEDIA_CHANGE: case SDEV_EVT_INQUIRY_CHANGE_REPORTED: case SDEV_EVT_CAPACITY_CHANGE_REPORTED: case SDEV_EVT_SOFT_THRESHOLD_REACHED_REPORTED: case SDEV_EVT_MODE_PARAMETER_CHANGE_REPORTED: case SDEV_EVT_LUN_CHANGE_REPORTED: case SDEV_EVT_ALUA_STATE_CHANGE_REPORTED: case SDEV_EVT_POWER_ON_RESET_OCCURRED: default: /* do nothing */ break; } return evt; } EXPORT_SYMBOL_GPL(sdev_evt_alloc); /** * sdev_evt_send_simple - send asserted event to uevent thread * @sdev: scsi_device event occurred on * @evt_type: type of event to send * @gfpflags: GFP flags for allocation * * Assert scsi device event asynchronously, given an event type. */ void sdev_evt_send_simple(struct scsi_device *sdev, enum scsi_device_event evt_type, gfp_t gfpflags) { struct scsi_event *evt = sdev_evt_alloc(evt_type, gfpflags); if (!evt) { sdev_printk(KERN_ERR, sdev, "event %d eaten due to OOM\n", evt_type); return; } sdev_evt_send(sdev, evt); } EXPORT_SYMBOL_GPL(sdev_evt_send_simple); /** * scsi_device_quiesce - Block all commands except power management. * @sdev: scsi device to quiesce. * * This works by trying to transition to the SDEV_QUIESCE state * (which must be a legal transition). When the device is in this * state, only power management requests will be accepted, all others will * be deferred. * * Must be called with user context, may sleep. * * Returns zero if unsuccessful or an error if not. */ int scsi_device_quiesce(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; int err; /* * It is allowed to call scsi_device_quiesce() multiple times from * the same context but concurrent scsi_device_quiesce() calls are * not allowed. */ WARN_ON_ONCE(sdev->quiesced_by && sdev->quiesced_by != current); if (sdev->quiesced_by == current) return 0; blk_set_pm_only(q); blk_mq_freeze_queue(q); /* * Ensure that the effect of blk_set_pm_only() will be visible * for percpu_ref_tryget() callers that occur after the queue * unfreeze even if the queue was already frozen before this function * was called. See also https://lwn.net/Articles/573497/. */ synchronize_rcu(); blk_mq_unfreeze_queue(q); mutex_lock(&sdev->state_mutex); err = scsi_device_set_state(sdev, SDEV_QUIESCE); if (err == 0) sdev->quiesced_by = current; else blk_clear_pm_only(q); mutex_unlock(&sdev->state_mutex); return err; } EXPORT_SYMBOL(scsi_device_quiesce); /** * scsi_device_resume - Restart user issued commands to a quiesced device. * @sdev: scsi device to resume. * * Moves the device from quiesced back to running and restarts the * queues. * * Must be called with user context, may sleep. 
*/ void scsi_device_resume(struct scsi_device *sdev) { /* check if the device state was mutated prior to resume, and if * so assume the state is being managed elsewhere (for example * device deleted during suspend) */ mutex_lock(&sdev->state_mutex); if (sdev->sdev_state == SDEV_QUIESCE) scsi_device_set_state(sdev, SDEV_RUNNING); if (sdev->quiesced_by) { sdev->quiesced_by = NULL; blk_clear_pm_only(sdev->request_queue); } mutex_unlock(&sdev->state_mutex); } EXPORT_SYMBOL(scsi_device_resume); static void device_quiesce_fn(struct scsi_device *sdev, void *data) { scsi_device_quiesce(sdev); } void scsi_target_quiesce(struct scsi_target *starget) { starget_for_each_device(starget, NULL, device_quiesce_fn); } EXPORT_SYMBOL(scsi_target_quiesce); static void device_resume_fn(struct scsi_device *sdev, void *data) { scsi_device_resume(sdev); } void scsi_target_resume(struct scsi_target *starget) { starget_for_each_device(starget, NULL, device_resume_fn); } EXPORT_SYMBOL(scsi_target_resume); /** * scsi_internal_device_block_nowait - try to transition to the SDEV_BLOCK state * @sdev: device to block * * Pause SCSI command processing on the specified device. Does not sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_BLOCK state (which must be * a legal transition). When the device is in this state, command processing * is paused until the device leaves the SDEV_BLOCK state. See also * scsi_internal_device_unblock_nowait(). */ int scsi_internal_device_block_nowait(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; int err = 0; err = scsi_device_set_state(sdev, SDEV_BLOCK); if (err) { err = scsi_device_set_state(sdev, SDEV_CREATED_BLOCK); if (err) return err; } /* * The device has transitioned to SDEV_BLOCK. Stop the * block layer from calling the midlayer with this device's * request queue. */ blk_mq_quiesce_queue_nowait(q); return 0; } EXPORT_SYMBOL_GPL(scsi_internal_device_block_nowait); /** * scsi_internal_device_block - try to transition to the SDEV_BLOCK state * @sdev: device to block * * Pause SCSI command processing on the specified device and wait until all * ongoing scsi_request_fn() / scsi_queue_rq() calls have finished. May sleep. * * Returns zero if successful or a negative error code upon failure. * * Note: * This routine transitions the device to the SDEV_BLOCK state (which must be * a legal transition). When the device is in this state, command processing * is paused until the device leaves the SDEV_BLOCK state. See also * scsi_internal_device_unblock(). */ static int scsi_internal_device_block(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; int err; mutex_lock(&sdev->state_mutex); err = scsi_internal_device_block_nowait(sdev); if (err == 0) blk_mq_quiesce_queue(q); mutex_unlock(&sdev->state_mutex); return err; } void scsi_start_queue(struct scsi_device *sdev) { struct request_queue *q = sdev->request_queue; blk_mq_unquiesce_queue(q); } /** * scsi_internal_device_unblock_nowait - resume a device after a block request * @sdev: device to resume * @new_state: state to set the device to after unblocking * * Restart the device queue for a previously suspended SCSI device. Does not * sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_RUNNING state or to one of * the offline states (which must be a legal transition) allowing the midlayer * to goose the queue for this device. 
*/ int scsi_internal_device_unblock_nowait(struct scsi_device *sdev, enum scsi_device_state new_state) { switch (new_state) { case SDEV_RUNNING: case SDEV_TRANSPORT_OFFLINE: break; default: return -EINVAL; } /* * Try to transition the scsi device to SDEV_RUNNING or one of the * offlined states and goose the device queue if successful. */ switch (sdev->sdev_state) { case SDEV_BLOCK: case SDEV_TRANSPORT_OFFLINE: sdev->sdev_state = new_state; break; case SDEV_CREATED_BLOCK: if (new_state == SDEV_TRANSPORT_OFFLINE || new_state == SDEV_OFFLINE) sdev->sdev_state = new_state; else sdev->sdev_state = SDEV_CREATED; break; case SDEV_CANCEL: case SDEV_OFFLINE: break; default: return -EINVAL; } scsi_start_queue(sdev); return 0; } EXPORT_SYMBOL_GPL(scsi_internal_device_unblock_nowait); /** * scsi_internal_device_unblock - resume a device after a block request * @sdev: device to resume * @new_state: state to set the device to after unblocking * * Restart the device queue for a previously suspended SCSI device. May sleep. * * Returns zero if successful or a negative error code upon failure. * * Notes: * This routine transitions the device to the SDEV_RUNNING state or to one of * the offline states (which must be a legal transition) allowing the midlayer * to goose the queue for this device. */ static int scsi_internal_device_unblock(struct scsi_device *sdev, enum scsi_device_state new_state) { int ret; mutex_lock(&sdev->state_mutex); ret = scsi_internal_device_unblock_nowait(sdev, new_state); mutex_unlock(&sdev->state_mutex); return ret; } static void device_block(struct scsi_device *sdev, void *data) { int ret; ret = scsi_internal_device_block(sdev); WARN_ONCE(ret, "scsi_internal_device_block(%s) failed: ret = %d\n", dev_name(&sdev->sdev_gendev), ret); } static int target_block(struct device *dev, void *data) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), NULL, device_block); return 0; } void scsi_target_block(struct device *dev) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), NULL, device_block); else device_for_each_child(dev, NULL, target_block); } EXPORT_SYMBOL_GPL(scsi_target_block); static void device_unblock(struct scsi_device *sdev, void *data) { scsi_internal_device_unblock(sdev, *(enum scsi_device_state *)data); } static int target_unblock(struct device *dev, void *data) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), data, device_unblock); return 0; } void scsi_target_unblock(struct device *dev, enum scsi_device_state new_state) { if (scsi_is_target_device(dev)) starget_for_each_device(to_scsi_target(dev), &new_state, device_unblock); else device_for_each_child(dev, &new_state, target_unblock); } EXPORT_SYMBOL_GPL(scsi_target_unblock); int scsi_host_block(struct Scsi_Host *shost) { struct scsi_device *sdev; int ret = 0; /* * Call scsi_internal_device_block_nowait so we can avoid * calling synchronize_rcu() for each LUN. */ shost_for_each_device(sdev, shost) { mutex_lock(&sdev->state_mutex); ret = scsi_internal_device_block_nowait(sdev); mutex_unlock(&sdev->state_mutex); if (ret) { scsi_device_put(sdev); break; } } /* * SCSI never enables blk-mq's BLK_MQ_F_BLOCKING flag so * calling synchronize_rcu() once is enough. 
*/ WARN_ON_ONCE(shost->tag_set.flags & BLK_MQ_F_BLOCKING); if (!ret) synchronize_rcu(); return ret; } EXPORT_SYMBOL_GPL(scsi_host_block); int scsi_host_unblock(struct Scsi_Host *shost, int new_state) { struct scsi_device *sdev; int ret = 0; shost_for_each_device(sdev, shost) { ret = scsi_internal_device_unblock(sdev, new_state); if (ret) { scsi_device_put(sdev); break; } } return ret; } EXPORT_SYMBOL_GPL(scsi_host_unblock); /** * scsi_kmap_atomic_sg - find and atomically map an sg-elemnt * @sgl: scatter-gather list * @sg_count: number of segments in sg * @offset: offset in bytes into sg, on return offset into the mapped area * @len: bytes to map, on return number of bytes mapped * * Returns virtual address of the start of the mapped page */ void *scsi_kmap_atomic_sg(struct scatterlist *sgl, int sg_count, size_t *offset, size_t *len) { int i; size_t sg_len = 0, len_complete = 0; struct scatterlist *sg; struct page *page; WARN_ON(!irqs_disabled()); for_each_sg(sgl, sg, sg_count, i) { len_complete = sg_len; /* Complete sg-entries */ sg_len += sg->length; if (sg_len > *offset) break; } if (unlikely(i == sg_count)) { printk(KERN_ERR "%s: Bytes in sg: %zu, requested offset %zu, " "elements %d\n", __func__, sg_len, *offset, sg_count); WARN_ON(1); return NULL; } /* Offset starting from the beginning of first page in this sg-entry */ *offset = *offset - len_complete + sg->offset; /* Assumption: contiguous pages can be accessed as "page + i" */ page = nth_page(sg_page(sg), (*offset >> PAGE_SHIFT)); *offset &= ~PAGE_MASK; /* Bytes in this sg-entry from *offset to the end of the page */ sg_len = PAGE_SIZE - *offset; if (*len > sg_len) *len = sg_len; return kmap_atomic(page); } EXPORT_SYMBOL(scsi_kmap_atomic_sg); /** * scsi_kunmap_atomic_sg - atomically unmap a virtual address, previously mapped with scsi_kmap_atomic_sg * @virt: virtual address to be unmapped */ void scsi_kunmap_atomic_sg(void *virt) { kunmap_atomic(virt); } EXPORT_SYMBOL(scsi_kunmap_atomic_sg); void sdev_disable_disk_events(struct scsi_device *sdev) { atomic_inc(&sdev->disk_events_disable_depth); } EXPORT_SYMBOL(sdev_disable_disk_events); void sdev_enable_disk_events(struct scsi_device *sdev) { if (WARN_ON_ONCE(atomic_read(&sdev->disk_events_disable_depth) <= 0)) return; atomic_dec(&sdev->disk_events_disable_depth); } EXPORT_SYMBOL(sdev_enable_disk_events); static unsigned char designator_prio(const unsigned char *d) { if (d[1] & 0x30) /* not associated with LUN */ return 0; if (d[3] == 0) /* invalid length */ return 0; /* * Order of preference for lun descriptor: * - SCSI name string * - NAA IEEE Registered Extended * - EUI-64 based 16-byte * - EUI-64 based 12-byte * - NAA IEEE Registered * - NAA IEEE Extended * - EUI-64 based 8-byte * - SCSI name string (truncated) * - T10 Vendor ID * as longer descriptors reduce the likelyhood * of identification clashes. 
*/ switch (d[1] & 0xf) { case 8: /* SCSI name string, variable-length UTF-8 */ return 9; case 3: switch (d[4] >> 4) { case 6: /* NAA registered extended */ return 8; case 5: /* NAA registered */ return 5; case 4: /* NAA extended */ return 4; case 3: /* NAA locally assigned */ return 1; default: break; } break; case 2: switch (d[3]) { case 16: /* EUI64-based, 16 byte */ return 7; case 12: /* EUI64-based, 12 byte */ return 6; case 8: /* EUI64-based, 8 byte */ return 3; default: break; } break; case 1: /* T10 vendor ID */ return 1; default: break; } return 0; } /** * scsi_vpd_lun_id - return a unique device identification * @sdev: SCSI device * @id: buffer for the identification * @id_len: length of the buffer * * Copies a unique device identification into @id based * on the information in the VPD page 0x83 of the device. * The string will be formatted as a SCSI name string. * * Returns the length of the identification or error on failure. * If the identifier is longer than the supplied buffer the actual * identifier length is returned and the buffer is not zero-padded. */ int scsi_vpd_lun_id(struct scsi_device *sdev, char *id, size_t id_len) { u8 cur_id_prio = 0; u8 cur_id_size = 0; const unsigned char *d, *cur_id_str; const struct scsi_vpd *vpd_pg83; int id_size = -EINVAL; rcu_read_lock(); vpd_pg83 = rcu_dereference(sdev->vpd_pg83); if (!vpd_pg83) { rcu_read_unlock(); return -ENXIO; } /* The id string must be at least 20 bytes + terminating NULL byte */ if (id_len < 21) { rcu_read_unlock(); return -EINVAL; } memset(id, 0, id_len); d = vpd_pg83->data + 4; while (d < vpd_pg83->data + vpd_pg83->len) { u8 prio = designator_prio(d); if (prio == 0 || cur_id_prio > prio) goto next_desig; switch (d[1] & 0xf) { case 0x1: /* T10 Vendor ID */ if (cur_id_size > d[3]) break; cur_id_prio = prio; cur_id_size = d[3]; if (cur_id_size + 4 > id_len) cur_id_size = id_len - 4; cur_id_str = d + 4; id_size = snprintf(id, id_len, "t10.%*pE", cur_id_size, cur_id_str); break; case 0x2: /* EUI-64 */ cur_id_prio = prio; cur_id_size = d[3]; cur_id_str = d + 4; switch (cur_id_size) { case 8: id_size = snprintf(id, id_len, "eui.%8phN", cur_id_str); break; case 12: id_size = snprintf(id, id_len, "eui.%12phN", cur_id_str); break; case 16: id_size = snprintf(id, id_len, "eui.%16phN", cur_id_str); break; default: break; } break; case 0x3: /* NAA */ cur_id_prio = prio; cur_id_size = d[3]; cur_id_str = d + 4; switch (cur_id_size) { case 8: id_size = snprintf(id, id_len, "naa.%8phN", cur_id_str); break; case 16: id_size = snprintf(id, id_len, "naa.%16phN", cur_id_str); break; default: break; } break; case 0x8: /* SCSI name string */ if (cur_id_size > d[3]) break; /* Prefer others for truncated descriptor */ if (d[3] > id_len) { prio = 2; if (cur_id_prio > prio) break; } cur_id_prio = prio; cur_id_size = id_size = d[3]; cur_id_str = d + 4; if (cur_id_size >= id_len) cur_id_size = id_len - 1; memcpy(id, cur_id_str, cur_id_size); break; default: break; } next_desig: d += d[3] + 4; } rcu_read_unlock(); return id_size; } EXPORT_SYMBOL(scsi_vpd_lun_id); /* * scsi_vpd_tpg_id - return a target port group identifier * @sdev: SCSI device * * Returns the Target Port Group identifier from the information * froom VPD page 0x83 of the device. * * Returns the identifier or error on failure. 
*/ int scsi_vpd_tpg_id(struct scsi_device *sdev, int *rel_id) { const unsigned char *d; const struct scsi_vpd *vpd_pg83; int group_id = -EAGAIN, rel_port = -1; rcu_read_lock(); vpd_pg83 = rcu_dereference(sdev->vpd_pg83); if (!vpd_pg83) { rcu_read_unlock(); return -ENXIO; } d = vpd_pg83->data + 4; while (d < vpd_pg83->data + vpd_pg83->len) { switch (d[1] & 0xf) { case 0x4: /* Relative target port */ rel_port = get_unaligned_be16(&d[6]); break; case 0x5: /* Target port group */ group_id = get_unaligned_be16(&d[6]); break; default: break; } d += d[3] + 4; } rcu_read_unlock(); if (group_id >= 0 && rel_id && rel_port != -1) *rel_id = rel_port; return group_id; } EXPORT_SYMBOL(scsi_vpd_tpg_id);
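/*
 * Illustrative sketch, not part of scsi_lib.c: how an upper-level driver
 * might combine scsi_test_unit_ready() and scsi_mode_sense() as documented
 * above. The helper name, timeout, retry count and mode page are
 * hypothetical values chosen only for the example.
 */
#include <scsi/scsi_device.h>
#include <scsi/scsi_common.h>

static int example_read_caching_page(struct scsi_device *sdev)
{
	unsigned char buf[64];
	struct scsi_mode_data data;
	struct scsi_sense_hdr sshdr;
	int ret;

	/* Clear any pending UNIT ATTENTION before issuing MODE SENSE. */
	ret = scsi_test_unit_ready(sdev, 30 * HZ, 3, &sshdr);
	if (ret)
		return -ENODEV;

	/* Page 0x08 is the caching mode page; dbd = 0 keeps block descriptors. */
	ret = scsi_mode_sense(sdev, 0, 0x08, buf, sizeof(buf),
			      30 * HZ, 3, &data, &sshdr);
	if (ret < 0)
		return ret;

	/* The page proper starts after the mode header and block descriptors. */
	if (data.header_length + data.block_descriptor_length >= sizeof(buf))
		return -EIO;
	return buf[data.header_length + data.block_descriptor_length] & 0x3f;
}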
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM cgroup #if !defined(_TRACE_CGROUP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_CGROUP_H #include <linux/cgroup.h> #include <linux/tracepoint.h> DECLARE_EVENT_CLASS(cgroup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root), TP_STRUCT__entry( __field( int, root ) __field( u16, ss_mask ) __string( name, root->name ) ), TP_fast_assign( __entry->root = root->hierarchy_id; __entry->ss_mask = root->subsys_mask; __assign_str(name, root->name); ), TP_printk("root=%d ss_mask=%#x name=%s", __entry->root, __entry->ss_mask, __get_str(name)) ); DEFINE_EVENT(cgroup_root, cgroup_setup_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_destroy_root, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DEFINE_EVENT(cgroup_root, cgroup_remount, TP_PROTO(struct cgroup_root *root), TP_ARGS(root) ); DECLARE_EVENT_CLASS(cgroup, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path), TP_STRUCT__entry( __field( int, root ) __field( int, id ) __field( int, level ) __string( path, path ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); ), TP_printk("root=%d id=%d level=%d path=%s", __entry->root, __entry->id, __entry->level, __get_str(path)) ); DEFINE_EVENT(cgroup, cgroup_mkdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rmdir, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_release, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_rename, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_freeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DEFINE_EVENT(cgroup, cgroup_unfreeze, TP_PROTO(struct cgroup *cgrp, const char *path), TP_ARGS(cgrp, path) ); DECLARE_EVENT_CLASS(cgroup_migrate, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup), TP_STRUCT__entry( __field( int, dst_root ) __field( int, dst_id ) __field( int, dst_level ) __field( int, pid ) __string( dst_path, path ) __string( comm, task->comm ) ), TP_fast_assign( __entry->dst_root = dst_cgrp->root->hierarchy_id; __entry->dst_id = cgroup_id(dst_cgrp); __entry->dst_level = dst_cgrp->level; __assign_str(dst_path, path); __entry->pid = task->pid; __assign_str(comm, task->comm); ), TP_printk("dst_root=%d dst_id=%d dst_level=%d dst_path=%s pid=%d comm=%s", __entry->dst_root, __entry->dst_id, __entry->dst_level, __get_str(dst_path), __entry->pid, __get_str(comm)) ); 
DEFINE_EVENT(cgroup_migrate, cgroup_attach_task, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DEFINE_EVENT(cgroup_migrate, cgroup_transfer_tasks, TP_PROTO(struct cgroup *dst_cgrp, const char *path, struct task_struct *task, bool threadgroup), TP_ARGS(dst_cgrp, path, task, threadgroup) ); DECLARE_EVENT_CLASS(cgroup_event, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val), TP_STRUCT__entry( __field( int, root ) __field( int, id ) __field( int, level ) __string( path, path ) __field( int, val ) ), TP_fast_assign( __entry->root = cgrp->root->hierarchy_id; __entry->id = cgroup_id(cgrp); __entry->level = cgrp->level; __assign_str(path, path); __entry->val = val; ), TP_printk("root=%d id=%d level=%d path=%s val=%d", __entry->root, __entry->id, __entry->level, __get_str(path), __entry->val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_populated, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); DEFINE_EVENT(cgroup_event, cgroup_notify_frozen, TP_PROTO(struct cgroup *cgrp, const char *path, int val), TP_ARGS(cgrp, path, val) ); #endif /* _TRACE_CGROUP_H */ /* This part must be outside protection */ #include <trace/define_trace.h>
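/*
 * Illustrative sketch, not part of this header: how cgroup core code could
 * fire one of the tracepoints declared above. The function name and the
 * fixed-size path buffer are hypothetical simplifications; the real callers
 * resolve the path into a shared buffer and only when the event is enabled.
 */
#include <linux/cgroup.h>
#include <trace/events/cgroup.h>

static void example_trace_mkdir(struct cgroup *cgrp)
{
	char path[128];

	/* Skip the (relatively costly) path lookup unless someone is listening. */
	if (!trace_cgroup_mkdir_enabled())
		return;

	cgroup_path(cgrp, path, sizeof(path));
	trace_cgroup_mkdir(cgrp, path);
}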
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 /* SPDX-License-Identifier: GPL-2.0-only */ /* * net busy poll support * Copyright(c) 2013 Intel Corporation. * * Author: Eliezer Tamir * * Contact Information: * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net> */ #ifndef _LINUX_NET_BUSY_POLL_H #define _LINUX_NET_BUSY_POLL_H #include <linux/netdevice.h> #include <linux/sched/clock.h> #include <linux/sched/signal.h> #include <net/ip.h> /* 0 - Reserved to indicate value not set * 1..NR_CPUS - Reserved for sender_cpu * NR_CPUS+1..~0 - Region available for NAPI IDs */ #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) #ifdef CONFIG_NET_RX_BUSY_POLL struct napi_struct; extern unsigned int sysctl_net_busy_read __read_mostly; extern unsigned int sysctl_net_busy_poll __read_mostly; static inline bool net_busy_loop_on(void) { return sysctl_net_busy_poll; } static inline bool sk_can_busy_loop(const struct sock *sk) { return READ_ONCE(sk->sk_ll_usec) && !signal_pending(current); } bool sk_busy_loop_end(void *p, unsigned long start_time); void napi_busy_loop(unsigned int napi_id, bool (*loop_end)(void *, unsigned long), void *loop_end_arg); #else /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long net_busy_loop_on(void) { return 0; } static inline bool sk_can_busy_loop(struct sock *sk) { return false; } #endif /* CONFIG_NET_RX_BUSY_POLL */ static inline unsigned long busy_loop_current_time(void) { #ifdef CONFIG_NET_RX_BUSY_POLL return (unsigned long)(local_clock() >> 10); #else return 0; #endif } /* in poll/select we use the global sysctl_net_ll_poll value */ static inline bool busy_loop_timeout(unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sysctl_net_busy_poll); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline bool sk_busy_loop_timeout(struct sock *sk, unsigned long start_time) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned long bp_usec = READ_ONCE(sk->sk_ll_usec); if (bp_usec) { unsigned long end_time = start_time + bp_usec; unsigned long now = busy_loop_current_time(); return time_after(now, end_time); } #endif return true; } static inline void sk_busy_loop(struct sock *sk, int nonblock) { #ifdef CONFIG_NET_RX_BUSY_POLL unsigned int napi_id = READ_ONCE(sk->sk_napi_id); if (napi_id >= MIN_NAPI_ID) napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk); #endif } /* used in the NIC receive handler to mark the skb */ static inline void skb_mark_napi_id(struct sk_buff *skb, struct napi_struct *napi) { #ifdef CONFIG_NET_RX_BUSY_POLL /* If the skb was already marked with a valid NAPI ID, avoid overwriting * it. 
*/ if (skb->napi_id < MIN_NAPI_ID) skb->napi_id = napi->napi_id; #endif } /* used in the protocol hanlder to propagate the napi_id to the socket */ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif sk_rx_queue_set(sk, skb); } /* variant used for unconnected sockets */ static inline void sk_mark_napi_id_once(struct sock *sk, const struct sk_buff *skb) { #ifdef CONFIG_NET_RX_BUSY_POLL if (!READ_ONCE(sk->sk_napi_id)) WRITE_ONCE(sk->sk_napi_id, skb->napi_id); #endif } #endif /* _LINUX_NET_BUSY_POLL_H */
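/*
 * Illustrative sketch, not part of this header: the shape of a receive path
 * that busy polls before sleeping, using sk_can_busy_loop() and
 * sk_busy_loop() from above. The function name is hypothetical and the logic
 * is simplified relative to real callers such as the datagram receive code.
 */
#include <net/busy_poll.h>
#include <net/sock.h>

static int example_wait_for_data(struct sock *sk, int nonblock)
{
	/* Spin on the socket's NAPI context first, if busy polling is enabled. */
	if (sk_can_busy_loop(sk) &&
	    skb_queue_empty_lockless(&sk->sk_receive_queue))
		sk_busy_loop(sk, nonblock);

	/* Caller falls back to a normal sleep if there is still no data. */
	return skb_queue_empty_lockless(&sk->sk_receive_queue) ? -EAGAIN : 0;
}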
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Internal procfs definitions * * Copyright (C) 2004 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) */ #include <linux/proc_fs.h> #include <linux/proc_ns.h> #include <linux/refcount.h> #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/binfmts.h> #include <linux/sched/coredump.h> #include <linux/sched/task.h> struct ctl_table_header; struct mempolicy; /* * This is not completely implemented yet. The idea is to * create an in-memory tree (like the actual /proc filesystem * tree) of these proc_dir_entries, so that we can dynamically * add new files to /proc. * * parent/subdir are used for the directory structure (every /proc file has a * parent, but "subdir" is empty for all non-directory entries). * subdir_node is used to build the rb tree "subdir" of the parent. */ struct proc_dir_entry { /* * number of callers into module in progress; * negative -> it's going away RSN */ atomic_t in_use; refcount_t refcnt; struct list_head pde_openers; /* who did ->open, but not ->release */ /* protects ->pde_openers and all struct pde_opener instances */ spinlock_t pde_unload_lock; struct completion *pde_unload_completion; const struct inode_operations *proc_iops; union { const struct proc_ops *proc_ops; const struct file_operations *proc_dir_ops; }; const struct dentry_operations *proc_dops; union { const struct seq_operations *seq_ops; int (*single_show)(struct seq_file *, void *); }; proc_write_t write; void *data; unsigned int state_size; unsigned int low_ino; nlink_t nlink; kuid_t uid; kgid_t gid; loff_t size; struct proc_dir_entry *parent; struct rb_root subdir; struct rb_node subdir_node; char *name; umode_t mode; u8 flags; u8 namelen; char inline_name[]; } __randomize_layout; #define SIZEOF_PDE ( \ sizeof(struct proc_dir_entry) < 128 ? 128 : \ sizeof(struct proc_dir_entry) < 192 ? 192 : \ sizeof(struct proc_dir_entry) < 256 ? 256 : \ sizeof(struct proc_dir_entry) < 512 ? 
512 : \ 0) #define SIZEOF_PDE_INLINE_NAME (SIZEOF_PDE - sizeof(struct proc_dir_entry)) static inline bool pde_is_permanent(const struct proc_dir_entry *pde) { return pde->flags & PROC_ENTRY_PERMANENT; } extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); union proc_op { int (*proc_get_link)(struct dentry *, struct path *); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); const char *lsm; }; struct proc_inode { struct pid *pid; unsigned int fd; union proc_op op; struct proc_dir_entry *pde; struct ctl_table_header *sysctl; struct ctl_table *sysctl_entry; struct hlist_node sibling_inodes; const struct proc_ns_operations *ns_ops; struct inode vfs_inode; } __randomize_layout; /* * General functions */ static inline struct proc_inode *PROC_I(const struct inode *inode) { return container_of(inode, struct proc_inode, vfs_inode); } static inline struct proc_dir_entry *PDE(const struct inode *inode) { return PROC_I(inode)->pde; } static inline void *__PDE_DATA(const struct inode *inode) { return PDE(inode)->data; } static inline struct pid *proc_pid(const struct inode *inode) { return PROC_I(inode)->pid; } static inline struct task_struct *get_proc_task(const struct inode *inode) { return get_pid_task(proc_pid(inode), PIDTYPE_PID); } void task_dump_owner(struct task_struct *task, umode_t mode, kuid_t *ruid, kgid_t *rgid); unsigned name_to_int(const struct qstr *qstr); /* * Offset of the first process in the /proc root directory.. */ #define FIRST_PROCESS_ENTRY 256 /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 /* * array.c */ extern const struct file_operations proc_tid_children_operations; extern void proc_task_name(struct seq_file *m, struct task_struct *p, bool escape); extern int proc_tid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_tgid_stat(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_status(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, struct pid *, struct task_struct *); /* * base.c */ extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(const struct path *, struct kstat *, u32, unsigned int); extern int proc_setattr(struct dentry *, struct iattr *); extern void proc_pid_evict_inode(struct proc_inode *); extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern void pid_update_inode(struct task_struct *, struct inode *); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); struct dentry *proc_pid_lookup(struct dentry *, unsigned int); extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* * generic.c */ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, struct proc_dir_entry **parent, void *data); struct proc_dir_entry *proc_register(struct proc_dir_entry *dir, struct proc_dir_entry *dp); extern struct dentry *proc_lookup(struct inode *, struct dentry *, unsigned int); struct dentry *proc_lookup_de(struct inode *, struct dentry *, struct proc_dir_entry 
*); extern int proc_readdir(struct file *, struct dir_context *); int proc_readdir_de(struct file *, struct dir_context *, struct proc_dir_entry *); static inline struct proc_dir_entry *pde_get(struct proc_dir_entry *pde) { refcount_inc(&pde->refcnt); return pde; } extern void pde_put(struct proc_dir_entry *); static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } extern ssize_t proc_simple_write(struct file *, const char __user *, size_t, loff_t *); /* * inode.c */ struct pde_opener { struct list_head lh; struct file *file; bool closing; struct completion *c; } __randomize_layout; extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern const struct super_operations proc_sops; void proc_init_kmemcache(void); void proc_invalidate_siblings_dcache(struct hlist_head *inodes, spinlock_t *lock); void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern void proc_entry_rundown(struct proc_dir_entry *); /* * proc_namespaces.c */ extern const struct inode_operations proc_ns_dir_inode_operations; extern const struct file_operations proc_ns_dir_operations; /* * proc_net.c */ extern const struct file_operations proc_net_operations; extern const struct inode_operations proc_net_inode_operations; #ifdef CONFIG_NET extern int proc_net_init(void); #else static inline int proc_net_init(void) { return 0; } #endif /* * proc_self.c */ extern int proc_setup_self(struct super_block *); /* * proc_thread_self.c */ extern int proc_setup_thread_self(struct super_block *); extern void proc_thread_self_init(void); /* * proc_sysctl.c */ #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); extern void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head); #else static inline void proc_sys_init(void) { } static inline void proc_sys_evict_inode(struct inode *inode, struct ctl_table_header *head) { } #endif /* * proc_tty.c */ #ifdef CONFIG_TTY extern void proc_tty_init(void); #else static inline void proc_tty_init(void) {} #endif /* * root.c */ extern struct proc_dir_entry proc_root; extern void proc_self_init(void); /* * task_[no]mmu.c */ struct mem_size_stats; struct proc_maps_private { struct inode *inode; struct task_struct *task; struct mm_struct *mm; #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; #endif } __randomize_layout; struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; extern const struct file_operations proc_pid_smaps_operations; extern const struct file_operations proc_pid_smaps_rollup_operations; extern const struct file_operations proc_clear_refs_operations; extern const struct file_operations proc_pagemap_operations; extern unsigned long task_vsize(struct mm_struct *); extern unsigned long task_statm(struct mm_struct *, unsigned long *, unsigned long *, unsigned long *, unsigned long *); extern void task_mem(struct seq_file *, struct mm_struct *); extern const struct dentry_operations proc_net_dentry_ops; static inline void pde_force_lookup(struct proc_dir_entry *pde) { /* /proc/net/ entries can be changed under us by setns(CLONE_NEWNET) */ pde->proc_dops = &proc_net_dentry_ops; }
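/*
 * Illustrative sketch, not part of fs/proc/internal.h: how code in fs/proc/
 * typically resolves the task behind a /proc/<pid> inode with the helpers
 * above. The function name is hypothetical.
 */
#include <linux/sched.h>
#include <linux/sched/task.h>

static int example_pid_of_inode(const struct inode *inode)
{
	struct task_struct *task;
	int nr = -ESRCH;

	/* get_proc_task() takes a reference; drop it with put_task_struct(). */
	task = get_proc_task(inode);
	if (task) {
		nr = task_pid_nr(task);
		put_task_struct(task);
	}
	return nr;
}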
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 /* SPDX-License-Identifier: GPL-2.0 */ #undef TRACE_SYSTEM #define TRACE_SYSTEM xdp #if !defined(_TRACE_XDP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_XDP_H #include <linux/netdevice.h> #include <linux/filter.h> #include <linux/tracepoint.h> #include <linux/bpf.h> #define __XDP_ACT_MAP(FN) \ FN(ABORTED) \ FN(DROP) \ FN(PASS) \ FN(TX) \ FN(REDIRECT) #define __XDP_ACT_TP_FN(x) \ TRACE_DEFINE_ENUM(XDP_##x); #define __XDP_ACT_SYM_FN(x) \ { XDP_##x, #x }, #define __XDP_ACT_SYM_TAB \ __XDP_ACT_MAP(__XDP_ACT_SYM_FN) { -1, NULL } __XDP_ACT_MAP(__XDP_ACT_TP_FN) TRACE_EVENT(xdp_exception, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, u32 act), TP_ARGS(dev, xdp, act), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = act; __entry->ifindex = dev->ifindex; ), TP_printk("prog_id=%d action=%s ifindex=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex) ); TRACE_EVENT(xdp_bulk_tx, TP_PROTO(const struct net_device *dev, int sent, int drops, int err), TP_ARGS(dev, sent, drops, err), TP_STRUCT__entry( __field(int, ifindex) __field(u32, act) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->ifindex = dev->ifindex; __entry->act = XDP_TX; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ifindex=%d action=%s sent=%d drops=%d err=%d", __entry->ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); #ifndef __DEVMAP_OBJ_TYPE #define __DEVMAP_OBJ_TYPE struct _bpf_dtab_netdev { struct net_device *dev; }; #endif /* __DEVMAP_OBJ_TYPE */ #define devmap_ifindex(tgt, map) \ (((map->map_type == BPF_MAP_TYPE_DEVMAP || \ map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)) ? 
\ ((struct _bpf_dtab_netdev *)tgt)->dev->ifindex : 0) DECLARE_EVENT_CLASS(xdp_redirect_template, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, const struct bpf_map *map, u32 index), TP_ARGS(dev, xdp, tgt, err, map, index), TP_STRUCT__entry( __field(int, prog_id) __field(u32, act) __field(int, ifindex) __field(int, err) __field(int, to_ifindex) __field(u32, map_id) __field(int, map_index) ), TP_fast_assign( __entry->prog_id = xdp->aux->id; __entry->act = XDP_REDIRECT; __entry->ifindex = dev->ifindex; __entry->err = err; __entry->to_ifindex = map ? devmap_ifindex(tgt, map) : index; __entry->map_id = map ? map->id : 0; __entry->map_index = map ? index : 0; ), TP_printk("prog_id=%d action=%s ifindex=%d to_ifindex=%d err=%d" " map_id=%d map_index=%d", __entry->prog_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->ifindex, __entry->to_ifindex, __entry->err, __entry->map_id, __entry->map_index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, const struct bpf_map *map, u32 index), TP_ARGS(dev, xdp, tgt, err, map, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, const struct bpf_map *map, u32 index), TP_ARGS(dev, xdp, tgt, err, map, index) ); #define _trace_xdp_redirect(dev, xdp, to) \ trace_xdp_redirect(dev, xdp, NULL, 0, NULL, to); #define _trace_xdp_redirect_err(dev, xdp, to, err) \ trace_xdp_redirect_err(dev, xdp, NULL, err, NULL, to); #define _trace_xdp_redirect_map(dev, xdp, to, map, index) \ trace_xdp_redirect(dev, xdp, to, 0, map, index); #define _trace_xdp_redirect_map_err(dev, xdp, to, map, index, err) \ trace_xdp_redirect_err(dev, xdp, to, err, map, index); /* not used anymore, but kept around so as not to break old programs */ DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, const struct bpf_map *map, u32 index), TP_ARGS(dev, xdp, tgt, err, map, index) ); DEFINE_EVENT(xdp_redirect_template, xdp_redirect_map_err, TP_PROTO(const struct net_device *dev, const struct bpf_prog *xdp, const void *tgt, int err, const struct bpf_map *map, u32 index), TP_ARGS(dev, xdp, tgt, err, map, index) ); TRACE_EVENT(xdp_cpumap_kthread, TP_PROTO(int map_id, unsigned int processed, unsigned int drops, int sched, struct xdp_cpumap_stats *xdp_stats), TP_ARGS(map_id, processed, drops, sched, xdp_stats), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, sched) __field(unsigned int, xdp_pass) __field(unsigned int, xdp_drop) __field(unsigned int, xdp_redirect) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->sched = sched; __entry->xdp_pass = xdp_stats->pass; __entry->xdp_drop = xdp_stats->drop; __entry->xdp_redirect = xdp_stats->redirect; ), TP_printk("kthread" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " sched=%d" " xdp_pass=%u xdp_drop=%u xdp_redirect=%u", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->sched, __entry->xdp_pass, __entry->xdp_drop, __entry->xdp_redirect) ); TRACE_EVENT(xdp_cpumap_enqueue, TP_PROTO(int map_id, unsigned int 
processed, unsigned int drops, int to_cpu), TP_ARGS(map_id, processed, drops, to_cpu), TP_STRUCT__entry( __field(int, map_id) __field(u32, act) __field(int, cpu) __field(unsigned int, drops) __field(unsigned int, processed) __field(int, to_cpu) ), TP_fast_assign( __entry->map_id = map_id; __entry->act = XDP_REDIRECT; __entry->cpu = smp_processor_id(); __entry->drops = drops; __entry->processed = processed; __entry->to_cpu = to_cpu; ), TP_printk("enqueue" " cpu=%d map_id=%d action=%s" " processed=%u drops=%u" " to_cpu=%d", __entry->cpu, __entry->map_id, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->processed, __entry->drops, __entry->to_cpu) ); TRACE_EVENT(xdp_devmap_xmit, TP_PROTO(const struct net_device *from_dev, const struct net_device *to_dev, int sent, int drops, int err), TP_ARGS(from_dev, to_dev, sent, drops, err), TP_STRUCT__entry( __field(int, from_ifindex) __field(u32, act) __field(int, to_ifindex) __field(int, drops) __field(int, sent) __field(int, err) ), TP_fast_assign( __entry->from_ifindex = from_dev->ifindex; __entry->act = XDP_REDIRECT; __entry->to_ifindex = to_dev->ifindex; __entry->drops = drops; __entry->sent = sent; __entry->err = err; ), TP_printk("ndo_xdp_xmit" " from_ifindex=%d to_ifindex=%d action=%s" " sent=%d drops=%d" " err=%d", __entry->from_ifindex, __entry->to_ifindex, __print_symbolic(__entry->act, __XDP_ACT_SYM_TAB), __entry->sent, __entry->drops, __entry->err) ); /* Expect users already include <net/xdp.h>, but not xdp_priv.h */ #include <net/xdp_priv.h> #define __MEM_TYPE_MAP(FN) \ FN(PAGE_SHARED) \ FN(PAGE_ORDER0) \ FN(PAGE_POOL) \ FN(XSK_BUFF_POOL) #define __MEM_TYPE_TP_FN(x) \ TRACE_DEFINE_ENUM(MEM_TYPE_##x); #define __MEM_TYPE_SYM_FN(x) \ { MEM_TYPE_##x, #x }, #define __MEM_TYPE_SYM_TAB \ __MEM_TYPE_MAP(__MEM_TYPE_SYM_FN) { -1, 0 } __MEM_TYPE_MAP(__MEM_TYPE_TP_FN) TRACE_EVENT(mem_disconnect, TP_PROTO(const struct xdp_mem_allocator *xa), TP_ARGS(xa), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; ), TP_printk("mem_id=%d mem_type=%s allocator=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator ) ); TRACE_EVENT(mem_connect, TP_PROTO(const struct xdp_mem_allocator *xa, const struct xdp_rxq_info *rxq), TP_ARGS(xa, rxq), TP_STRUCT__entry( __field(const struct xdp_mem_allocator *, xa) __field(u32, mem_id) __field(u32, mem_type) __field(const void *, allocator) __field(const struct xdp_rxq_info *, rxq) __field(int, ifindex) ), TP_fast_assign( __entry->xa = xa; __entry->mem_id = xa->mem.id; __entry->mem_type = xa->mem.type; __entry->allocator = xa->allocator; __entry->rxq = rxq; __entry->ifindex = rxq->dev->ifindex; ), TP_printk("mem_id=%d mem_type=%s allocator=%p" " ifindex=%d", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->allocator, __entry->ifindex ) ); TRACE_EVENT(mem_return_failed, TP_PROTO(const struct xdp_mem_info *mem, const struct page *page), TP_ARGS(mem, page), TP_STRUCT__entry( __field(const struct page *, page) __field(u32, mem_id) __field(u32, mem_type) ), TP_fast_assign( __entry->page = page; __entry->mem_id = mem->id; __entry->mem_type = mem->type; ), TP_printk("mem_id=%d mem_type=%s page=%p", __entry->mem_id, __print_symbolic(__entry->mem_type, __MEM_TYPE_SYM_TAB), __entry->page ) ); #endif /* _TRACE_XDP_H */ #include 
<trace/define_trace.h>
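/*
 * Illustrative sketch, not part of this header: how a driver's XDP receive
 * path typically reports an unexpected program verdict through the
 * xdp_exception tracepoint defined above. The function name is hypothetical.
 */
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <trace/events/xdp.h>

static u32 example_handle_xdp_verdict(struct net_device *dev,
				      struct bpf_prog *prog, u32 act)
{
	switch (act) {
	case XDP_PASS:
	case XDP_TX:
	case XDP_REDIRECT:
		return act;
	default:
		/* Unknown verdicts are treated like XDP_ABORTED. */
		fallthrough;
	case XDP_ABORTED:
		/* Record which program and interface produced the bad verdict. */
		trace_xdp_exception(dev, prog, act);
		return XDP_DROP;
	}
}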
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * AEAD: Authenticated Encryption with Associated Data * * Copyright (c) 2007-2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_INTERNAL_AEAD_H #define _CRYPTO_INTERNAL_AEAD_H #include <crypto/aead.h> #include <crypto/algapi.h> #include <linux/stddef.h> #include <linux/types.h> struct rtattr; struct aead_instance { void (*free)(struct aead_instance *inst); union { struct { char head[offsetof(struct aead_alg, base)]; struct crypto_instance base; } s; struct aead_alg alg; }; }; struct crypto_aead_spawn { struct crypto_spawn base; }; struct aead_queue { struct crypto_queue base; }; static inline void *crypto_aead_ctx(struct crypto_aead *tfm) { return crypto_tfm_ctx(&tfm->base); } static inline struct crypto_instance *aead_crypto_instance( struct aead_instance *inst) { return container_of(&inst->alg.base, struct crypto_instance, alg); } static inline struct aead_instance *aead_instance(struct crypto_instance *inst) { return container_of(&inst->alg, struct aead_instance, alg.base); } static inline struct aead_instance *aead_alg_instance(struct crypto_aead *aead) { return aead_instance(crypto_tfm_alg_instance(&aead->base)); } static inline void *aead_instance_ctx(struct aead_instance *inst) { return crypto_instance_ctx(aead_crypto_instance(inst)); } static inline void *aead_request_ctx(struct aead_request *req) { return req->__ctx; } static inline void aead_request_complete(struct aead_request *req, int err) { req->base.complete(&req->base, err); } static inline u32 aead_request_flags(struct aead_request *req) { return req->base.flags; } static inline struct aead_request *aead_request_cast( struct crypto_async_request *req) { return container_of(req, struct aead_request, base); } int crypto_grab_aead(struct crypto_aead_spawn *spawn, struct crypto_instance *inst, const char *name, u32 type, u32 mask); static inline void crypto_drop_aead(struct crypto_aead_spawn *spawn) { crypto_drop_spawn(&spawn->base); } static inline struct aead_alg *crypto_spawn_aead_alg( struct crypto_aead_spawn *spawn) { return container_of(spawn->base.alg, struct aead_alg, base); } static inline struct crypto_aead *crypto_spawn_aead( struct crypto_aead_spawn *spawn) { return crypto_spawn_tfm2(&spawn->base); } static inline void crypto_aead_set_reqsize(struct crypto_aead *aead, unsigned int reqsize) { aead->reqsize = reqsize; } static inline void aead_init_queue(struct aead_queue *queue, unsigned int max_qlen) { crypto_init_queue(&queue->base, max_qlen); } static inline int aead_enqueue_request(struct aead_queue *queue, struct aead_request *request) { return crypto_enqueue_request(&queue->base, &request->base); } static inline struct aead_request *aead_dequeue_request( struct aead_queue *queue) { struct crypto_async_request *req; req = crypto_dequeue_request(&queue->base); return req ? 
container_of(req, struct aead_request, base) : NULL; } static inline struct aead_request *aead_get_backlog(struct aead_queue *queue) { struct crypto_async_request *req; req = crypto_get_backlog(&queue->base); return req ? container_of(req, struct aead_request, base) : NULL; } static inline unsigned int crypto_aead_alg_chunksize(struct aead_alg *alg) { return alg->chunksize; } /** * crypto_aead_chunksize() - obtain chunk size * @tfm: cipher handle * * The block size is set to one for ciphers such as CCM. However, * you still need to provide incremental updates in multiples of * the underlying block size as the IV does not have sub-block * granularity. This is known in this API as the chunk size. * * Return: chunk size in bytes */ static inline unsigned int crypto_aead_chunksize(struct crypto_aead *tfm) { return crypto_aead_alg_chunksize(crypto_aead_alg(tfm)); } int crypto_register_aead(struct aead_alg *alg); void crypto_unregister_aead(struct aead_alg *alg); int crypto_register_aeads(struct aead_alg *algs, int count); void crypto_unregister_aeads(struct aead_alg *algs, int count); int aead_register_instance(struct crypto_template *tmpl, struct aead_instance *inst); #endif /* _CRYPTO_INTERNAL_AEAD_H */
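/*
 * Illustrative sketch, not part of this header: the shape of an AEAD
 * template's ->init_tfm(), sizing its request context around an inner AEAD
 * with the spawn and reqsize helpers above. The struct and function names
 * are hypothetical.
 */
#include <crypto/internal/aead.h>
#include <linux/err.h>

struct example_tfm_ctx {
	struct crypto_aead *child;
};

static int example_aead_init_tfm(struct crypto_aead *tfm)
{
	struct aead_instance *inst = aead_alg_instance(tfm);
	struct crypto_aead_spawn *spawn = aead_instance_ctx(inst);
	struct example_tfm_ctx *ctx = crypto_aead_ctx(tfm);
	struct crypto_aead *child;

	child = crypto_spawn_aead(spawn);
	if (IS_ERR(child))
		return PTR_ERR(child);

	ctx->child = child;
	/* Reserve room for a child request behind this transform's own context. */
	crypto_aead_set_reqsize(tfm, sizeof(struct aead_request) +
				     crypto_aead_reqsize(child));
	return 0;
}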
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_UACCESS_64_H
#define _ASM_X86_UACCESS_64_H

/*
 * User space memory access functions
 */
#include <linux/compiler.h>
#include <linux/lockdep.h>
#include <linux/kasan-checks.h>
#include <asm/alternative.h>
#include <asm/cpufeatures.h>
#include <asm/page.h>

/*
 * Copy To/From Userspace
 */

/* Handles exceptions in both to and from, but doesn't do access_ok */
__must_check unsigned long
copy_user_enhanced_fast_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_string(void *to, const void *from, unsigned len);
__must_check unsigned long
copy_user_generic_unrolled(void *to, const void *from, unsigned len);

static __always_inline __must_check unsigned long
copy_user_generic(void *to, const void *from, unsigned len)
{
	unsigned ret;

	/*
	 * If CPU has ERMS feature, use copy_user_enhanced_fast_string.
	 * Otherwise, if CPU has rep_good feature, use copy_user_generic_string.
	 * Otherwise, use copy_user_generic_unrolled.
	 */
	alternative_call_2(copy_user_generic_unrolled,
			 copy_user_generic_string,
			 X86_FEATURE_REP_GOOD,
			 copy_user_enhanced_fast_string,
			 X86_FEATURE_ERMS,
			 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
				     "=d" (len)),
			 "1" (to), "2" (from), "3" (len)
			 : "memory", "rcx", "r8", "r9", "r10", "r11");
	return ret;
}

static __always_inline __must_check unsigned long
raw_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
	return copy_user_generic(dst, (__force void *)src, size);
}

static __always_inline __must_check unsigned long
raw_copy_to_user(void __user *dst, const void *src, unsigned long size)
{
	return copy_user_generic((__force void *)dst, src, size);
}

static __always_inline __must_check unsigned long
raw_copy_in_user(void __user *dst, const void __user *src, unsigned long size)
{
	return copy_user_generic((__force void *)dst,
				 (__force void *)src, size);
}

extern long __copy_user_nocache(void *dst, const void __user *src,
				unsigned size, int zerorest);

extern long __copy_user_flushcache(void *dst, const void __user *src, unsigned size);
extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset,
				   size_t len);

static inline int
__copy_from_user_inatomic_nocache(void *dst, const void __user *src,
				  unsigned size)
{
	kasan_check_write(dst, size);
	return __copy_user_nocache(dst, src, size, 0);
}

static inline int
__copy_from_user_flushcache(void *dst, const void __user *src, unsigned size)
{
	kasan_check_write(dst, size);
	return __copy_user_flushcache(dst, src, size);
}

#endif /* _ASM_X86_UACCESS_64_H */
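/*
 * Illustrative sketch, not part of this header: what the generic
 * copy_from_user() wrapper layers on top of raw_copy_from_user() above,
 * namely an access_ok() range check, which the raw helpers deliberately
 * skip. Simplified: the real wrapper also instruments the access and zeroes
 * the uncopied tail of the destination buffer.
 */
#include <linux/uaccess.h>

static inline unsigned long
example_copy_from_user(void *dst, const void __user *src, unsigned long size)
{
	unsigned long not_copied = size;

	if (access_ok(src, size))
		not_copied = raw_copy_from_user(dst, src, size);

	/* Like the raw helper, returns the number of bytes NOT copied. */
	return not_copied;
}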
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 /* SPDX-License-Identifier: GPL-2.0 */ /* * fs-verity: read-only file-based authenticity protection * * This header declares the interface between the fs/verity/ support layer and * filesystems that support fs-verity. * * Copyright 2019 Google LLC */ #ifndef _LINUX_FSVERITY_H #define _LINUX_FSVERITY_H #include <linux/fs.h> #include <uapi/linux/fsverity.h> /* Verity operations for filesystems */ struct fsverity_operations { /** * Begin enabling verity on the given file. * * @filp: a readonly file descriptor for the file * * The filesystem must do any needed filesystem-specific preparations * for enabling verity, e.g. evicting inline data. It also must return * -EBUSY if verity is already being enabled on the given file. * * i_rwsem is held for write. * * Return: 0 on success, -errno on failure */ int (*begin_enable_verity)(struct file *filp); /** * End enabling verity on the given file. * * @filp: a readonly file descriptor for the file * @desc: the verity descriptor to write, or NULL on failure * @desc_size: size of verity descriptor, or 0 on failure * @merkle_tree_size: total bytes the Merkle tree took up * * If desc == NULL, then enabling verity failed and the filesystem only * must do any necessary cleanups. Else, it must also store the given * verity descriptor to a fs-specific location associated with the inode * and do any fs-specific actions needed to mark the inode as a verity * inode, e.g. setting a bit in the on-disk inode. The filesystem is * also responsible for setting the S_VERITY flag in the VFS inode. * * i_rwsem is held for write, but it may have been dropped between * ->begin_enable_verity() and ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*end_enable_verity)(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size); /** * Get the verity descriptor of the given inode. * * @inode: an inode with the S_VERITY flag set * @buf: buffer in which to place the verity descriptor * @bufsize: size of @buf, or 0 to retrieve the size only * * If bufsize == 0, then the size of the verity descriptor is returned. * Otherwise the verity descriptor is written to 'buf' and its actual * size is returned; -ERANGE is returned if it's too large. This may be * called by multiple processes concurrently on the same inode. * * Return: the size on success, -errno on failure */ int (*get_verity_descriptor)(struct inode *inode, void *buf, size_t bufsize); /** * Read a Merkle tree page of the given inode. * * @inode: the inode * @index: 0-based index of the page within the Merkle tree * @num_ra_pages: The number of Merkle tree pages that should be * prefetched starting at @index if the page at @index * isn't already cached. 
Implementations may ignore this * argument; it's only a performance optimization. * * This can be called at any time on an open verity file, as well as * between ->begin_enable_verity() and ->end_enable_verity(). It may be * called by multiple processes concurrently, even with the same page. * * Note that this must retrieve a *page*, not necessarily a *block*. * * Return: the page on success, ERR_PTR() on failure */ struct page *(*read_merkle_tree_page)(struct inode *inode, pgoff_t index, unsigned long num_ra_pages); /** * Write a Merkle tree block to the given inode. * * @inode: the inode for which the Merkle tree is being built * @buf: block to write * @index: 0-based index of the block within the Merkle tree * @log_blocksize: log base 2 of the Merkle tree block size * * This is only called between ->begin_enable_verity() and * ->end_enable_verity(). * * Return: 0 on success, -errno on failure */ int (*write_merkle_tree_block)(struct inode *inode, const void *buf, u64 index, int log_blocksize); }; #ifdef CONFIG_FS_VERITY static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { /* * Pairs with the cmpxchg_release() in fsverity_set_info(). * I.e., another task may publish ->i_verity_info concurrently, * executing a RELEASE barrier. We need to use smp_load_acquire() here * to safely ACQUIRE the memory the other task published. */ return smp_load_acquire(&inode->i_verity_info); } /* enable.c */ int fsverity_ioctl_enable(struct file *filp, const void __user *arg); /* measure.c */ int fsverity_ioctl_measure(struct file *filp, void __user *arg); /* open.c */ int fsverity_file_open(struct inode *inode, struct file *filp); int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr); void fsverity_cleanup_inode(struct inode *inode); /* verify.c */ bool fsverity_verify_page(struct page *page); void fsverity_verify_bio(struct bio *bio); void fsverity_enqueue_verify_work(struct work_struct *work); #else /* !CONFIG_FS_VERITY */ static inline struct fsverity_info *fsverity_get_info(const struct inode *inode) { return NULL; } /* enable.c */ static inline int fsverity_ioctl_enable(struct file *filp, const void __user *arg) { return -EOPNOTSUPP; } /* measure.c */ static inline int fsverity_ioctl_measure(struct file *filp, void __user *arg) { return -EOPNOTSUPP; } /* open.c */ static inline int fsverity_file_open(struct inode *inode, struct file *filp) { return IS_VERITY(inode) ? -EOPNOTSUPP : 0; } static inline int fsverity_prepare_setattr(struct dentry *dentry, struct iattr *attr) { return IS_VERITY(d_inode(dentry)) ? -EOPNOTSUPP : 0; } static inline void fsverity_cleanup_inode(struct inode *inode) { } /* verify.c */ static inline bool fsverity_verify_page(struct page *page) { WARN_ON(1); return false; } static inline void fsverity_verify_bio(struct bio *bio) { WARN_ON(1); } static inline void fsverity_enqueue_verify_work(struct work_struct *work) { WARN_ON(1); } #endif /* !CONFIG_FS_VERITY */ /** * fsverity_active() - do reads from the inode need to go through fs-verity? * @inode: inode to check * * This checks whether ->i_verity_info has been set. * * Filesystems call this from ->readpages() to check whether the pages need to * be verified or not. Don't use IS_VERITY() for this purpose; it's subject to * a race condition where the file is being read concurrently with * FS_IOC_ENABLE_VERITY completing. (S_VERITY is set before ->i_verity_info.) 
* * Return: true if reads need to go through fs-verity, otherwise false */ static inline bool fsverity_active(const struct inode *inode) { return fsverity_get_info(inode) != NULL; } #endif /* _LINUX_FSVERITY_H */
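/*
 * Illustrative sketch built on the fs-verity hooks above: the shape of a
 * filesystem's ->open() path and its post-read verification step. The myfs_*
 * names are hypothetical; this is a minimal sketch, not a reference
 * implementation.
 */
static int myfs_file_open(struct inode *inode, struct file *filp)
{
	int err;

	err = fsverity_file_open(inode, filp);	/* rejects writes to verity files */
	if (err)
		return err;
	return generic_file_open(inode, filp);
}

static void myfs_post_read(struct page *page)
{
	/* Only verify when ->i_verity_info is set; see fsverity_active() above */
	if (fsverity_active(page->mapping->host) &&
	    !fsverity_verify_page(page))
		SetPageError(page);
}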
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 /* SPDX-License-Identifier: GPL-2.0 */ /* linux/net/inet/arp.h */ #ifndef _ARP_H #define _ARP_H #include <linux/if_arp.h> #include <linux/hash.h> #include <net/neighbour.h> extern struct neigh_table arp_tbl; static inline u32 arp_hashfn(const void *pkey, const struct net_device *dev, u32 *hash_rnd) { u32 key = *(const u32 *)pkey; u32 val = key ^ hash32_ptr(dev); return val * hash_rnd[0]; } #ifdef CONFIG_INET static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) key = INADDR_ANY; return ___neigh_lookup_noref(&arp_tbl, neigh_key_eq32, arp_hashfn, &key, dev); } #else static inline struct neighbour *__ipv4_neigh_lookup_noref(struct net_device *dev, u32 key) { return NULL; } #endif static inline struct neighbour *__ipv4_neigh_lookup(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock_bh(); n = __ipv4_neigh_lookup_noref(dev, key); if (n && !refcount_inc_not_zero(&n->refcnt)) n = NULL; rcu_read_unlock_bh(); return n; } static inline void __ipv4_confirm_neigh(struct net_device *dev, u32 key) { struct neighbour *n; rcu_read_lock_bh(); n = __ipv4_neigh_lookup_noref(dev, key); if (n) { unsigned long now = jiffies; /* avoid dirtying neighbour */ if (READ_ONCE(n->confirmed) != now) WRITE_ONCE(n->confirmed, now); } rcu_read_unlock_bh(); } void arp_init(void); int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg); void arp_send(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *th); int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir); void arp_ifdown(struct net_device *dev); struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, struct net_device *dev, __be32 src_ip, const unsigned char *dest_hw, const unsigned char *src_hw, const unsigned char *target_hw); void arp_xmit(struct sk_buff *skb); #endif /* _ARP_H */
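/*
 * Illustrative sketch built on the lookup helpers above: checking whether a
 * neighbour entry exists for an IPv4 next hop. __ipv4_neigh_lookup() takes a
 * reference on success, so the caller must drop it with neigh_release().
 * demo_nexthop_is_reachable() is a hypothetical name.
 */
static bool demo_nexthop_is_reachable(struct net_device *dev, __be32 nexthop)
{
	struct neighbour *n;
	bool reachable = false;

	n = __ipv4_neigh_lookup(dev, (__force u32)nexthop);
	if (n) {
		reachable = !!(n->nud_state & NUD_VALID);
		neigh_release(n);
	}
	return reachable;
}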
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/genhd.h> #include "../blk.h" /* * add_gd_partition adds a partitions details to the devices partition * description. */ struct parsed_partitions { struct block_device *bdev; char name[BDEVNAME_SIZE]; struct { sector_t from; sector_t size; int flags; bool has_info; struct partition_meta_info info; } *parts; int next; int limit; bool access_beyond_eod; char *pp_buf; }; typedef struct { struct page *v; } Sector; void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); static inline void put_dev_sector(Sector p) { put_page(p.v); } static inline void put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) { if (n < p->limit) { char tmp[1 + BDEVNAME_SIZE + 10 + 1]; p->parts[n].from = from; p->parts[n].size = size; snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); strlcat(p->pp_buf, tmp, PAGE_SIZE); } } /* detection routines go here in alphabetical order: */ int adfspart_check_ADFS(struct parsed_partitions *state); int adfspart_check_CUMANA(struct parsed_partitions *state); int adfspart_check_EESOX(struct parsed_partitions *state); int adfspart_check_ICS(struct parsed_partitions *state); int adfspart_check_POWERTEC(struct parsed_partitions *state); int aix_partition(struct parsed_partitions *state); int amiga_partition(struct parsed_partitions *state); int atari_partition(struct parsed_partitions *state); int cmdline_partition(struct parsed_partitions *state); int efi_partition(struct parsed_partitions *state); int ibm_partition(struct parsed_partitions *); int karma_partition(struct parsed_partitions *state); int ldm_partition(struct parsed_partitions *state); int mac_partition(struct parsed_partitions *state); int msdos_partition(struct parsed_partitions *state); int osf_partition(struct parsed_partitions *state); int sgi_partition(struct parsed_partitions *state); int sun_partition(struct parsed_partitions *state); int sysv68_partition(struct parsed_partitions *state); int ultrix_partition(struct parsed_partitions *state);
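/*
 * Illustrative sketch of a partition detection routine using the helpers
 * above: read sector 0, check a magic value and register one partition via
 * put_partition(). The routine name, magic bytes and partition geometry are
 * hypothetical; the return convention (1 = recognised, 0 = not this format,
 * negative = I/O error) follows the existing *_partition() routines.
 */
static int demo_partition(struct parsed_partitions *state)
{
	Sector sect;
	unsigned char *data;

	data = read_part_sector(state, 0, &sect);
	if (!data)
		return -1;

	if (data[510] != 0x55 || data[511] != 0xaa) {	/* hypothetical magic */
		put_dev_sector(sect);
		return 0;
	}

	/* slot 0 describes the whole disk; detected partitions start at 1 */
	put_partition(state, 1, 2048, 409600);
	put_dev_sector(sect);
	strlcat(state->pp_buf, "\n", PAGE_SIZE);
	return 1;
}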
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * RNG: Random Number Generator algorithms under the crypto API * * Copyright (c) 2008 Neil Horman <nhorman@tuxdriver.com> * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au> */ #ifndef _CRYPTO_RNG_H #define _CRYPTO_RNG_H #include <linux/crypto.h> struct crypto_rng; /** * struct rng_alg - random number generator definition * * @generate: The function defined by this variable obtains a * random number. The random number generator transform * must generate the random number out of the context * provided with this call, plus any additional data * if provided to the call. * @seed: Seed or reseed the random number generator. With the * invocation of this function call, the random number * generator shall become ready for generation. If the * random number generator requires a seed for setting * up a new state, the seed must be provided by the * consumer while invoking this function. The required * size of the seed is defined with @seedsize . * @set_ent: Set entropy that would otherwise be obtained from * entropy source. Internal use only. * @seedsize: The seed size required for a random number generator * initialization defined with this variable. Some * random number generators does not require a seed * as the seeding is implemented internally without * the need of support by the consumer. In this case, * the seed size is set to zero. * @base: Common crypto API algorithm data structure. */ struct rng_alg { int (*generate)(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen); int (*seed)(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); void (*set_ent)(struct crypto_rng *tfm, const u8 *data, unsigned int len); unsigned int seedsize; struct crypto_alg base; }; struct crypto_rng { struct crypto_tfm base; }; extern struct crypto_rng *crypto_default_rng; int crypto_get_default_rng(void); void crypto_put_default_rng(void); /** * DOC: Random number generator API * * The random number generator API is used with the ciphers of type * CRYPTO_ALG_TYPE_RNG (listed as type "rng" in /proc/crypto) */ /** * crypto_alloc_rng() -- allocate RNG handle * @alg_name: is the cra_name / name or cra_driver_name / driver name of the * message digest cipher * @type: specifies the type of the cipher * @mask: specifies the mask for the cipher * * Allocate a cipher handle for a random number generator. The returned struct * crypto_rng is the cipher handle that is required for any subsequent * API invocation for that random number generator. * * For all random number generators, this call creates a new private copy of * the random number generator that does not share a state with other * instances. 
The only exception is the "krng" random number generator which * is a kernel crypto API use case for the get_random_bytes() function of the * /dev/random driver. * * Return: allocated cipher handle in case of success; IS_ERR() is true in case * of an error, PTR_ERR() returns the error code. */ struct crypto_rng *crypto_alloc_rng(const char *alg_name, u32 type, u32 mask); static inline struct crypto_tfm *crypto_rng_tfm(struct crypto_rng *tfm) { return &tfm->base; } /** * crypto_rng_alg - obtain name of RNG * @tfm: cipher handle * * Return the generic name (cra_name) of the initialized random number generator * * Return: generic name string */ static inline struct rng_alg *crypto_rng_alg(struct crypto_rng *tfm) { return container_of(crypto_rng_tfm(tfm)->__crt_alg, struct rng_alg, base); } /** * crypto_free_rng() - zeroize and free RNG handle * @tfm: cipher handle to be freed * * If @tfm is a NULL or error pointer, this function does nothing. */ static inline void crypto_free_rng(struct crypto_rng *tfm) { crypto_destroy_tfm(tfm, crypto_rng_tfm(tfm)); } /** * crypto_rng_generate() - get random number * @tfm: cipher handle * @src: Input buffer holding additional data, may be NULL * @slen: Length of additional data * @dst: output buffer holding the random numbers * @dlen: length of the output buffer * * This function fills the caller-allocated buffer with random * numbers using the random number generator referenced by the * cipher handle. * * Return: 0 function was successful; < 0 if an error occurred */ static inline int crypto_rng_generate(struct crypto_rng *tfm, const u8 *src, unsigned int slen, u8 *dst, unsigned int dlen) { struct crypto_alg *alg = tfm->base.__crt_alg; int ret; crypto_stats_get(alg); ret = crypto_rng_alg(tfm)->generate(tfm, src, slen, dst, dlen); crypto_stats_rng_generate(alg, dlen, ret); return ret; } /** * crypto_rng_get_bytes() - get random number * @tfm: cipher handle * @rdata: output buffer holding the random numbers * @dlen: length of the output buffer * * This function fills the caller-allocated buffer with random numbers using the * random number generator referenced by the cipher handle. * * Return: 0 function was successful; < 0 if an error occurred */ static inline int crypto_rng_get_bytes(struct crypto_rng *tfm, u8 *rdata, unsigned int dlen) { return crypto_rng_generate(tfm, NULL, 0, rdata, dlen); } /** * crypto_rng_reset() - re-initialize the RNG * @tfm: cipher handle * @seed: seed input data * @slen: length of the seed input data * * The reset function completely re-initializes the random number generator * referenced by the cipher handle by clearing the current state. The new state * is initialized with the caller provided seed or automatically, depending * on the random number generator type (the ANSI X9.31 RNG requires * caller-provided seed, the SP800-90A DRBGs perform an automatic seeding). * The seed is provided as a parameter to this function call. The provided seed * should have the length of the seed size defined for the random number * generator as defined by crypto_rng_seedsize. * * Return: 0 if the setting of the key was successful; < 0 if an error occurred */ int crypto_rng_reset(struct crypto_rng *tfm, const u8 *seed, unsigned int slen); /** * crypto_rng_seedsize() - obtain seed size of RNG * @tfm: cipher handle * * The function returns the seed size for the random number generator * referenced by the cipher handle. This value may be zero if the random * number generator does not implement or require a reseeding. 
For example, * the SP800-90A DRBGs implement an automated reseeding after reaching a * pre-defined threshold. * * Return: seed size for the random number generator */ static inline int crypto_rng_seedsize(struct crypto_rng *tfm) { return crypto_rng_alg(tfm)->seedsize; } #endif
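/*
 * Illustrative sketch of the allocation/seed/generate/free cycle described
 * above, using the system default RNG name "stdrng". Error handling is
 * abbreviated and demo_get_random() is a hypothetical helper.
 */
static int demo_get_random(u8 *buf, unsigned int len,
			   const u8 *seed, unsigned int slen)
{
	struct crypto_rng *rng;
	int err;

	rng = crypto_alloc_rng("stdrng", 0, 0);
	if (IS_ERR(rng))
		return PTR_ERR(rng);

	err = crypto_rng_reset(rng, seed, slen);	/* (re)seed the instance */
	if (!err)
		err = crypto_rng_get_bytes(rng, buf, len);

	crypto_free_rng(rng);
	return err;
}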
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2020 ARM Ltd.
 */
#ifndef __ASM_VDSO_PROCESSOR_H
#define __ASM_VDSO_PROCESSOR_H

#ifndef __ASSEMBLY__

/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
static __always_inline void rep_nop(void)
{
	asm volatile("rep; nop" ::: "memory");
}

static __always_inline void cpu_relax(void)
{
	rep_nop();
}

#endif /* __ASSEMBLY__ */

#endif /* __ASM_VDSO_PROCESSOR_H */
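/*
 * Illustrative sketch of the busy-wait pattern cpu_relax() is meant for:
 * polling a hardware status register for a ready bit. The register layout
 * and demo_wait_until_ready() are hypothetical.
 */
#include <asm/io.h>

static void demo_wait_until_ready(void __iomem *status_reg)
{
	while (!(readl(status_reg) & 0x1))	/* bit 0: hypothetical READY flag */
		cpu_relax();			/* pause hint to the CPU/SMT sibling */
}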
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 /* SPDX-License-Identifier: GPL-2.0 */ /* * A security context is a set of security attributes * associated with each subject and object controlled * by the security policy. Security contexts are * externally represented as variable-length strings * that can be interpreted by a user or application * with an understanding of the security policy. * Internally, the security server uses a simple * structure. This structure is private to the * security server and can be changed without affecting * clients of the security server. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ #ifndef _SS_CONTEXT_H_ #define _SS_CONTEXT_H_ #include "ebitmap.h" #include "mls_types.h" #include "security.h" /* * A security context consists of an authenticated user * identity, a role, a type and a MLS range. */ struct context { u32 user; u32 role; u32 type; u32 len; /* length of string in bytes */ struct mls_range range; char *str; /* string representation if context cannot be mapped. */ }; static inline void mls_context_init(struct context *c) { memset(&c->range, 0, sizeof(c->range)); } static inline int mls_context_cpy(struct context *dst, struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the low level of 'src'. */ static inline int mls_context_cpy_low(struct context *dst, struct context *src) { int rc; dst->range.level[0].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[0].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[0].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[0].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } /* * Sets both levels in the MLS range of 'dst' to the high level of 'src'. 
*/ static inline int mls_context_cpy_high(struct context *dst, struct context *src) { int rc; dst->range.level[0].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[0].cat, &src->range.level[1].cat); if (rc) goto out; dst->range.level[1].sens = src->range.level[1].sens; rc = ebitmap_cpy(&dst->range.level[1].cat, &src->range.level[1].cat); if (rc) ebitmap_destroy(&dst->range.level[0].cat); out: return rc; } static inline int mls_context_glblub(struct context *dst, struct context *c1, struct context *c2) { struct mls_range *dr = &dst->range, *r1 = &c1->range, *r2 = &c2->range; int rc = 0; if (r1->level[1].sens < r2->level[0].sens || r2->level[1].sens < r1->level[0].sens) /* These ranges have no common sensitivities */ return -EINVAL; /* Take the greatest of the low */ dr->level[0].sens = max(r1->level[0].sens, r2->level[0].sens); /* Take the least of the high */ dr->level[1].sens = min(r1->level[1].sens, r2->level[1].sens); rc = ebitmap_and(&dr->level[0].cat, &r1->level[0].cat, &r2->level[0].cat); if (rc) goto out; rc = ebitmap_and(&dr->level[1].cat, &r1->level[1].cat, &r2->level[1].cat); if (rc) goto out; out: return rc; } static inline int mls_context_cmp(struct context *c1, struct context *c2) { return ((c1->range.level[0].sens == c2->range.level[0].sens) && ebitmap_cmp(&c1->range.level[0].cat, &c2->range.level[0].cat) && (c1->range.level[1].sens == c2->range.level[1].sens) && ebitmap_cmp(&c1->range.level[1].cat, &c2->range.level[1].cat)); } static inline void mls_context_destroy(struct context *c) { ebitmap_destroy(&c->range.level[0].cat); ebitmap_destroy(&c->range.level[1].cat); mls_context_init(c); } static inline void context_init(struct context *c) { memset(c, 0, sizeof(*c)); } static inline int context_cpy(struct context *dst, struct context *src) { int rc; dst->user = src->user; dst->role = src->role; dst->type = src->type; if (src->str) { dst->str = kstrdup(src->str, GFP_ATOMIC); if (!dst->str) return -ENOMEM; dst->len = src->len; } else { dst->str = NULL; dst->len = 0; } rc = mls_context_cpy(dst, src); if (rc) { kfree(dst->str); return rc; } return 0; } static inline void context_destroy(struct context *c) { c->user = c->role = c->type = 0; kfree(c->str); c->str = NULL; c->len = 0; mls_context_destroy(c); } static inline int context_cmp(struct context *c1, struct context *c2) { if (c1->len && c2->len) return (c1->len == c2->len && !strcmp(c1->str, c2->str)); if (c1->len || c2->len) return 0; return ((c1->user == c2->user) && (c1->role == c2->role) && (c1->type == c2->type) && mls_context_cmp(c1, c2)); } u32 context_compute_hash(const struct context *c); #endif /* _SS_CONTEXT_H_ */
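/*
 * Illustrative sketch of the copy/destroy discipline for the helpers above:
 * context_cpy() can fail because the MLS category bitmaps need allocation,
 * so the error path matters. demo_dup_context() is a hypothetical helper.
 */
static inline int demo_dup_context(struct context *dst, struct context *src)
{
	int rc;

	context_init(dst);		/* start from a well-defined empty state */
	rc = context_cpy(dst, src);
	if (rc)
		return rc;		/* -ENOMEM: nothing to clean up */

	/* ... use dst ... */

	context_destroy(dst);
	return 0;
}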
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_VMSTAT_H #define _LINUX_VMSTAT_H #include <linux/types.h> #include <linux/percpu.h> #include <linux/mmzone.h> #include <linux/vm_event_item.h> #include <linux/atomic.h> #include <linux/static_key.h> #include <linux/mmdebug.h> extern int sysctl_stat_interval; #ifdef CONFIG_NUMA #define ENABLE_NUMA_STAT 1 #define DISABLE_NUMA_STAT 0 extern int sysctl_vm_numa_stat; DECLARE_STATIC_KEY_TRUE(vm_numa_stat_key); int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos); #endif struct reclaim_stat { unsigned nr_dirty; unsigned nr_unqueued_dirty; unsigned nr_congested; unsigned nr_writeback; unsigned nr_immediate; unsigned nr_pageout; unsigned nr_activate[ANON_AND_FILE]; unsigned nr_ref_keep; unsigned nr_unmap_fail; unsigned nr_lazyfree_fail; }; enum writeback_stat_item { NR_DIRTY_THRESHOLD, NR_DIRTY_BG_THRESHOLD, NR_VM_WRITEBACK_STAT_ITEMS, }; #ifdef CONFIG_VM_EVENT_COUNTERS /* * Light weight per cpu counter implementation. * * Counters should only be incremented and no critical kernel component * should rely on the counter values. * * Counters are handled completely inline. On many platforms the code * generated will simply be the increment of a global address. */ struct vm_event_state { unsigned long event[NR_VM_EVENT_ITEMS]; }; DECLARE_PER_CPU(struct vm_event_state, vm_event_states); /* * vm counters are allowed to be racy. Use raw_cpu_ops to avoid the * local_irq_disable overhead. 
*/ static inline void __count_vm_event(enum vm_event_item item) { raw_cpu_inc(vm_event_states.event[item]); } static inline void count_vm_event(enum vm_event_item item) { this_cpu_inc(vm_event_states.event[item]); } static inline void __count_vm_events(enum vm_event_item item, long delta) { raw_cpu_add(vm_event_states.event[item], delta); } static inline void count_vm_events(enum vm_event_item item, long delta) { this_cpu_add(vm_event_states.event[item], delta); } extern void all_vm_events(unsigned long *); extern void vm_events_fold_cpu(int cpu); #else /* Disable counters */ static inline void count_vm_event(enum vm_event_item item) { } static inline void count_vm_events(enum vm_event_item item, long delta) { } static inline void __count_vm_event(enum vm_event_item item) { } static inline void __count_vm_events(enum vm_event_item item, long delta) { } static inline void all_vm_events(unsigned long *ret) { } static inline void vm_events_fold_cpu(int cpu) { } #endif /* CONFIG_VM_EVENT_COUNTERS */ #ifdef CONFIG_NUMA_BALANCING #define count_vm_numa_event(x) count_vm_event(x) #define count_vm_numa_events(x, y) count_vm_events(x, y) #else #define count_vm_numa_event(x) do {} while (0) #define count_vm_numa_events(x, y) do { (void)(y); } while (0) #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_DEBUG_TLBFLUSH #define count_vm_tlb_event(x) count_vm_event(x) #define count_vm_tlb_events(x, y) count_vm_events(x, y) #else #define count_vm_tlb_event(x) do {} while (0) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif #ifdef CONFIG_DEBUG_VM_VMACACHE #define count_vm_vmacache_event(x) count_vm_event(x) #else #define count_vm_vmacache_event(x) do {} while (0) #endif #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) /* * Zone and node-based page accounting with per cpu differentials. 
*/ extern atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS]; extern atomic_long_t vm_numa_stat[NR_VM_NUMA_STAT_ITEMS]; extern atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS]; #ifdef CONFIG_NUMA static inline void zone_numa_state_add(long x, struct zone *zone, enum numa_stat_item item) { atomic_long_add(x, &zone->vm_numa_stat[item]); atomic_long_add(x, &vm_numa_stat[item]); } static inline unsigned long global_numa_state(enum numa_stat_item item) { long x = atomic_long_read(&vm_numa_stat[item]); return x; } static inline unsigned long zone_numa_state_snapshot(struct zone *zone, enum numa_stat_item item) { long x = atomic_long_read(&zone->vm_numa_stat[item]); int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->pageset, cpu)->vm_numa_stat_diff[item]; return x; } #endif /* CONFIG_NUMA */ static inline void zone_page_state_add(long x, struct zone *zone, enum zone_stat_item item) { atomic_long_add(x, &zone->vm_stat[item]); atomic_long_add(x, &vm_zone_stat[item]); } static inline void node_page_state_add(long x, struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_add(x, &pgdat->vm_stat[item]); atomic_long_add(x, &vm_node_stat[item]); } static inline unsigned long global_zone_page_state(enum zone_stat_item item) { long x = atomic_long_read(&vm_zone_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state_pages(enum node_stat_item item) { long x = atomic_long_read(&vm_node_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } static inline unsigned long global_node_page_state(enum node_stat_item item) { VM_WARN_ON_ONCE(vmstat_item_in_bytes(item)); return global_node_page_state_pages(item); } static inline unsigned long zone_page_state(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP if (x < 0) x = 0; #endif return x; } /* * More accurate version that also considers the currently pending * deltas. For that we need to loop over all cpus to find the current * deltas. There is no synchronization so the result cannot be * exactly accurate either. 
*/ static inline unsigned long zone_page_state_snapshot(struct zone *zone, enum zone_stat_item item) { long x = atomic_long_read(&zone->vm_stat[item]); #ifdef CONFIG_SMP int cpu; for_each_online_cpu(cpu) x += per_cpu_ptr(zone->pageset, cpu)->vm_stat_diff[item]; if (x < 0) x = 0; #endif return x; } #ifdef CONFIG_NUMA extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, enum zone_stat_item item); extern unsigned long sum_zone_numa_state(int node, enum numa_stat_item item); extern unsigned long node_page_state(struct pglist_data *pgdat, enum node_stat_item item); extern unsigned long node_page_state_pages(struct pglist_data *pgdat, enum node_stat_item item); #else #define sum_zone_node_page_state(node, item) global_zone_page_state(item) #define node_page_state(node, item) global_node_page_state(item) #define node_page_state_pages(node, item) global_node_page_state_pages(item) #endif /* CONFIG_NUMA */ #ifdef CONFIG_SMP void __mod_zone_page_state(struct zone *, enum zone_stat_item item, long); void __inc_zone_page_state(struct page *, enum zone_stat_item); void __dec_zone_page_state(struct page *, enum zone_stat_item); void __mod_node_page_state(struct pglist_data *, enum node_stat_item item, long); void __inc_node_page_state(struct page *, enum node_stat_item); void __dec_node_page_state(struct page *, enum node_stat_item); void mod_zone_page_state(struct zone *, enum zone_stat_item, long); void inc_zone_page_state(struct page *, enum zone_stat_item); void dec_zone_page_state(struct page *, enum zone_stat_item); void mod_node_page_state(struct pglist_data *, enum node_stat_item, long); void inc_node_page_state(struct page *, enum node_stat_item); void dec_node_page_state(struct page *, enum node_stat_item); extern void inc_node_state(struct pglist_data *, enum node_stat_item); extern void __inc_zone_state(struct zone *, enum zone_stat_item); extern void __inc_node_state(struct pglist_data *, enum node_stat_item); extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_node_state(struct pglist_data *, enum node_stat_item); void quiet_vmstat(void); void cpu_vm_stats_fold(int cpu); void refresh_zone_stat_thresholds(void); struct ctl_table; int vmstat_refresh(struct ctl_table *, int write, void *buffer, size_t *lenp, loff_t *ppos); void drain_zonestat(struct zone *zone, struct per_cpu_pageset *); int calculate_pressure_threshold(struct zone *zone); int calculate_normal_threshold(struct zone *zone); void set_pgdat_percpu_threshold(pg_data_t *pgdat, int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* * We do not maintain differentials in a single processor configuration. * The functions directly modify the zone and global counters. 
*/ static inline void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, long delta) { zone_page_state_add(delta, zone, item); } static inline void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item, int delta) { if (vmstat_item_in_bytes(item)) { VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1)); delta >>= PAGE_SHIFT; } node_page_state_add(delta, pgdat, item); } static inline void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_inc(&zone->vm_stat[item]); atomic_long_inc(&vm_zone_stat[item]); } static inline void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_inc(&pgdat->vm_stat[item]); atomic_long_inc(&vm_node_stat[item]); } static inline void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { atomic_long_dec(&zone->vm_stat[item]); atomic_long_dec(&vm_zone_stat[item]); } static inline void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item) { atomic_long_dec(&pgdat->vm_stat[item]); atomic_long_dec(&vm_node_stat[item]); } static inline void __inc_zone_page_state(struct page *page, enum zone_stat_item item) { __inc_zone_state(page_zone(page), item); } static inline void __inc_node_page_state(struct page *page, enum node_stat_item item) { __inc_node_state(page_pgdat(page), item); } static inline void __dec_zone_page_state(struct page *page, enum zone_stat_item item) { __dec_zone_state(page_zone(page), item); } static inline void __dec_node_page_state(struct page *page, enum node_stat_item item) { __dec_node_state(page_pgdat(page), item); } /* * We only use atomic operations to update counters. So there is no need to * disable interrupts. */ #define inc_zone_page_state __inc_zone_page_state #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state #define inc_node_page_state __inc_node_page_state #define dec_node_page_state __dec_node_page_state #define mod_node_page_state __mod_node_page_state #define inc_zone_state __inc_zone_state #define inc_node_state __inc_node_state #define dec_zone_state __dec_zone_state #define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_zone_stat_thresholds(void) { } static inline void cpu_vm_stats_fold(int cpu) { } static inline void quiet_vmstat(void) { } static inline void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset) { } #endif /* CONFIG_SMP */ static inline void __mod_zone_freepage_state(struct zone *zone, int nr_pages, int migratetype) { __mod_zone_page_state(zone, NR_FREE_PAGES, nr_pages); if (is_migrate_cma(migratetype)) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, nr_pages); } extern const char * const vmstat_text[]; static inline const char *zone_stat_name(enum zone_stat_item item) { return vmstat_text[item]; } #ifdef CONFIG_NUMA static inline const char *numa_stat_name(enum numa_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + item]; } #endif /* CONFIG_NUMA */ static inline const char *node_stat_name(enum node_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_STAT_ITEMS + item]; } static inline const char *lru_list_name(enum lru_list lru) { return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_" } static inline const char *writeback_stat_name(enum writeback_stat_item item) { return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_STAT_ITEMS + NR_VM_NODE_STAT_ITEMS + item]; } #if defined(CONFIG_VM_EVENT_COUNTERS) || defined(CONFIG_MEMCG) static inline const char *vm_event_name(enum vm_event_item item) { 
return vmstat_text[NR_VM_ZONE_STAT_ITEMS + NR_VM_NUMA_STAT_ITEMS + NR_VM_NODE_STAT_ITEMS + NR_VM_WRITEBACK_STAT_ITEMS + item]; } #endif /* CONFIG_VM_EVENT_COUNTERS || CONFIG_MEMCG */ #endif /* _LINUX_VMSTAT_H */
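/*
 * Illustrative sketch of typical uses of the counters declared above:
 * count_vm_event() bumps a lockless per-cpu event counter, while
 * global_zone_page_state() reads an approximate global zone counter.
 * demo_free_pages_after_fault() is a hypothetical helper.
 */
static unsigned long demo_free_pages_after_fault(void)
{
	count_vm_event(PGFAULT);	/* per-cpu increment, no locking */
	return global_zone_page_state(NR_FREE_PAGES);
}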
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 /* SPDX-License-Identifier: GPL-2.0 */ /* * INETPEER - A storage for permanent information about peers * * Authors: Andrey V. Savochkin <saw@msu.ru> */ #ifndef _NET_INETPEER_H #define _NET_INETPEER_H #include <linux/types.h> #include <linux/init.h> #include <linux/jiffies.h> #include <linux/spinlock.h> #include <linux/rtnetlink.h> #include <net/ipv6.h> #include <linux/atomic.h> /* IPv4 address key for cache lookups */ struct ipv4_addr_key { __be32 addr; int vif; }; #define INETPEER_MAXKEYSZ (sizeof(struct in6_addr) / sizeof(u32)) struct inetpeer_addr { union { struct ipv4_addr_key a4; struct in6_addr a6; u32 key[INETPEER_MAXKEYSZ]; }; __u16 family; }; struct inet_peer { struct rb_node rb_node; struct inetpeer_addr daddr; u32 metrics[RTAX_MAX]; u32 rate_tokens; /* rate limiting for ICMP */ u32 n_redirects; unsigned long rate_last; /* * Once inet_peer is queued for deletion (refcnt == 0), following field * is not available: rid * We can share memory with rcu_head to help keep inet_peer small. */ union { struct { atomic_t rid; /* Frag reception counter */ }; struct rcu_head rcu; }; /* following fields might be frequently dirtied */ __u32 dtime; /* the time of last use of not referenced entries */ refcount_t refcnt; }; struct inet_peer_base { struct rb_root rb_root; seqlock_t lock; int total; }; void inet_peer_base_init(struct inet_peer_base *); void inet_initpeers(void) __init; #define INETPEER_METRICS_NEW (~(u32) 0) static inline void inetpeer_set_addr_v4(struct inetpeer_addr *iaddr, __be32 ip) { iaddr->a4.addr = ip; iaddr->a4.vif = 0; iaddr->family = AF_INET; } static inline __be32 inetpeer_get_addr_v4(struct inetpeer_addr *iaddr) { return iaddr->a4.addr; } static inline void inetpeer_set_addr_v6(struct inetpeer_addr *iaddr, struct in6_addr *in6) { iaddr->a6 = *in6; iaddr->family = AF_INET6; } static inline struct in6_addr *inetpeer_get_addr_v6(struct inetpeer_addr *iaddr) { return &iaddr->a6; } /* can be called with or without local BH being disabled */ struct inet_peer *inet_getpeer(struct inet_peer_base *base, const struct inetpeer_addr *daddr, int create); static inline struct inet_peer *inet_getpeer_v4(struct inet_peer_base *base, __be32 v4daddr, int vif, int create) { struct inetpeer_addr daddr; daddr.a4.addr = v4daddr; daddr.a4.vif = vif; daddr.family = AF_INET; return inet_getpeer(base, &daddr, create); } static inline struct inet_peer *inet_getpeer_v6(struct inet_peer_base *base, const struct in6_addr *v6daddr, int create) { struct inetpeer_addr daddr; daddr.a6 = *v6daddr; daddr.family = AF_INET6; return inet_getpeer(base, &daddr, create); } static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a, const struct inetpeer_addr *b) { int i, n; if (a->family == AF_INET) n = sizeof(a->a4) / sizeof(u32); else n = sizeof(a->a6) / sizeof(u32); for (i = 0; i < n; i++) { if (a->key[i] == b->key[i]) continue; if (a->key[i] < b->key[i]) return -1; return 1; } return 0; } /* can be called from BH context or outside */ void inet_putpeer(struct inet_peer *p); bool inet_peer_xrlim_allow(struct 
inet_peer *peer, int timeout); void inetpeer_invalidate_tree(struct inet_peer_base *); #endif /* _NET_INETPEER_H */
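/*
 * Illustrative sketch of the lookup/ratelimit/put pattern the inetpeer cache
 * is used for (e.g. ICMP rate limiting). inet_peer_xrlim_allow() tolerates a
 * NULL peer, so the lookup result can be passed through unconditionally.
 * demo_icmp_allowed() is a hypothetical helper.
 */
static bool demo_icmp_allowed(struct inet_peer_base *base, __be32 daddr,
			      int vif, int timeout)
{
	struct inet_peer *peer;
	bool allow;

	peer = inet_getpeer_v4(base, daddr, vif, 1);
	allow = inet_peer_xrlim_allow(peer, timeout);
	if (peer)
		inet_putpeer(peer);
	return allow;
}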
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* Red Black Trees (C) 1999 Andrea Arcangeli <andrea@suse.de> linux/include/linux/rbtree.h To use rbtrees you'll have to implement your own insert and search cores. This will avoid us to use callbacks and to drop drammatically performances. I know it's not the cleaner way, but in C (not in C++) to get performances and genericity... See Documentation/core-api/rbtree.rst for documentation and samples. */ #ifndef _LINUX_RBTREE_H #define _LINUX_RBTREE_H #include <linux/kernel.h> #include <linux/stddef.h> #include <linux/rcupdate.h> struct rb_node { unsigned long __rb_parent_color; struct rb_node *rb_right; struct rb_node *rb_left; } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ struct rb_root { struct rb_node *rb_node; }; #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } #define rb_entry(ptr, type, member) container_of(ptr, type, member) #define RB_EMPTY_ROOT(root) (READ_ONCE((root)->rb_node) == NULL) /* 'empty' nodes are nodes that are known not to be inserted in an rbtree */ #define RB_EMPTY_NODE(node) \ ((node)->__rb_parent_color == (unsigned long)(node)) #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); extern struct rb_node *rb_prev(const struct rb_node *); extern struct rb_node *rb_first(const struct rb_root *); extern struct rb_node *rb_last(const struct rb_root *); /* Postorder iteration - always visit the parent after its children */ extern struct rb_node *rb_first_postorder(const struct rb_root *); extern struct rb_node *rb_next_postorder(const struct rb_node *); /* Fast replacement of a single node without remove/rebalance/add/rebalance */ extern void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); extern void rb_replace_node_rcu(struct rb_node *victim, struct rb_node *new, struct rb_root *root); static inline void rb_link_node(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; *rb_link = node; } static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent, struct rb_node **rb_link) { node->__rb_parent_color = (unsigned long)parent; node->rb_left = node->rb_right = NULL; rcu_assign_pointer(*rb_link, node); } #define rb_entry_safe(ptr, type, member) \ ({ typeof(ptr) ____ptr = (ptr); \ ____ptr ? rb_entry(____ptr, type, member) : NULL; \ }) /** * rbtree_postorder_for_each_entry_safe - iterate in post-order over rb_root of * given type allowing the backing memory of @pos to be invalidated * * @pos: the 'type *' to use as a loop cursor. 
* @n: another 'type *' to use as temporary storage * @root: 'rb_root *' of the rbtree. * @field: the name of the rb_node field within 'type'. * * rbtree_postorder_for_each_entry_safe() provides a similar guarantee as * list_for_each_entry_safe() and allows the iteration to continue independent * of changes to @pos by the body of the loop. * * Note, however, that it cannot handle other modifications that re-order the * rbtree it is iterating over. This includes calling rb_erase() on @pos, as * rb_erase() may rebalance the tree, causing us to miss some nodes. */ #define rbtree_postorder_for_each_entry_safe(pos, n, root, field) \ for (pos = rb_entry_safe(rb_first_postorder(root), typeof(*pos), field); \ pos && ({ n = rb_entry_safe(rb_next_postorder(&pos->field), \ typeof(*pos), field); 1; }); \ pos = n) /* * Leftmost-cached rbtrees. * * We do not cache the rightmost node based on footprint * size vs number of potential users that could benefit * from O(1) rb_last(). Just not worth it, users that want * this feature can always implement the logic explicitly. * Furthermore, users that want to cache both pointers may * find it a bit asymmetric, but that's ok. */ struct rb_root_cached { struct rb_root rb_root; struct rb_node *rb_leftmost; }; #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } /* Same as rb_first(), but O(1) */ #define rb_first_cached(root) (root)->rb_leftmost static inline void rb_insert_color_cached(struct rb_node *node, struct rb_root_cached *root, bool leftmost) { if (leftmost) root->rb_leftmost = node; rb_insert_color(node, &root->rb_root); } static inline void rb_erase_cached(struct rb_node *node, struct rb_root_cached *root) { if (root->rb_leftmost == node) root->rb_leftmost = rb_next(node); rb_erase(node, &root->rb_root); } static inline void rb_replace_node_cached(struct rb_node *victim, struct rb_node *new, struct rb_root_cached *root) { if (root->rb_leftmost == victim) root->rb_leftmost = new; rb_replace_node(victim, new, &root->rb_root); } #endif /* _LINUX_RBTREE_H */
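/*
 * Illustrative sketch of the search and insert cores that the comment at the
 * top of this header asks every user to provide, in the style of
 * Documentation/core-api/rbtree.rst. struct demo_node and its key field are
 * hypothetical.
 */
struct demo_node {
	struct rb_node node;
	unsigned long key;
};

static struct demo_node *demo_search(struct rb_root *root, unsigned long key)
{
	struct rb_node *n = root->rb_node;

	while (n) {
		struct demo_node *d = rb_entry(n, struct demo_node, node);

		if (key < d->key)
			n = n->rb_left;
		else if (key > d->key)
			n = n->rb_right;
		else
			return d;
	}
	return NULL;
}

static bool demo_insert(struct rb_root *root, struct demo_node *new)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;

	while (*link) {
		struct demo_node *d = rb_entry(*link, struct demo_node, node);

		parent = *link;
		if (new->key < d->key)
			link = &(*link)->rb_left;
		else if (new->key > d->key)
			link = &(*link)->rb_right;
		else
			return false;		/* duplicate key */
	}

	rb_link_node(&new->node, parent, link);
	rb_insert_color(&new->node, root);	/* rebalance after linking */
	return true;
}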
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 /* SPDX-License-Identifier: GPL-2.0 */ /* * Statically sized hash table implementation * (C) 2012 Sasha Levin <levinsasha928@gmail.com> */ #ifndef _LINUX_HASHTABLE_H #define _LINUX_HASHTABLE_H #include <linux/list.h> #include <linux/types.h> #include <linux/kernel.h> #include <linux/hash.h> #include <linux/rculist.h> #define DEFINE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DEFINE_READ_MOSTLY_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] __read_mostly = \ { [0 ... ((1 << (bits)) - 1)] = HLIST_HEAD_INIT } #define DECLARE_HASHTABLE(name, bits) \ struct hlist_head name[1 << (bits)] #define HASH_SIZE(name) (ARRAY_SIZE(name)) #define HASH_BITS(name) ilog2(HASH_SIZE(name)) /* Use hash_32 when possible to allow for fast 32bit hashing in 64bit kernels. */ #define hash_min(val, bits) \ (sizeof(val) <= 4 ? hash_32(val, bits) : hash_long(val, bits)) static inline void __hash_init(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) INIT_HLIST_HEAD(&ht[i]); } /** * hash_init - initialize a hash table * @hashtable: hashtable to be initialized * * Calculates the size of the hashtable from the given parameter, otherwise * same as hash_init_size. * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. */ #define hash_init(hashtable) __hash_init(hashtable, HASH_SIZE(hashtable)) /** * hash_add - add an object to a hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add(hashtable, node, key) \ hlist_add_head(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_add_rcu - add an object to a rcu enabled hashtable * @hashtable: hashtable to add to * @node: the &struct hlist_node of the object to be added * @key: the key of the object to be added */ #define hash_add_rcu(hashtable, node, key) \ hlist_add_head_rcu(node, &hashtable[hash_min(key, HASH_BITS(hashtable))]) /** * hash_hashed - check whether an object is in any hashtable * @node: the &struct hlist_node of the object to be checked */ static inline bool hash_hashed(struct hlist_node *node) { return !hlist_unhashed(node); } static inline bool __hash_empty(struct hlist_head *ht, unsigned int sz) { unsigned int i; for (i = 0; i < sz; i++) if (!hlist_empty(&ht[i])) return false; return true; } /** * hash_empty - check whether a hashtable is empty * @hashtable: hashtable to check * * This has to be a macro since HASH_BITS() will not work on pointers since * it calculates the size during preprocessing. 
*/ #define hash_empty(hashtable) __hash_empty(hashtable, HASH_SIZE(hashtable)) /** * hash_del - remove an object from a hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del(struct hlist_node *node) { hlist_del_init(node); } /** * hash_del_rcu - remove an object from a rcu enabled hashtable * @node: &struct hlist_node of the object to remove */ static inline void hash_del_rcu(struct hlist_node *node) { hlist_del_init_rcu(node); } /** * hash_for_each - iterate over a hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry(obj, &name[bkt], member) /** * hash_for_each_rcu - iterate over a rcu enabled hashtable * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_rcu(name, bkt, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_rcu(obj, &name[bkt], member) /** * hash_for_each_safe - iterate over a hashtable safe against removal of * hash entry * @name: hashtable to iterate * @bkt: integer to use as bucket loop cursor * @tmp: a &struct hlist_node used for temporary storage * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct */ #define hash_for_each_safe(name, bkt, tmp, obj, member) \ for ((bkt) = 0, obj = NULL; obj == NULL && (bkt) < HASH_SIZE(name);\ (bkt)++)\ hlist_for_each_entry_safe(obj, tmp, &name[bkt], member) /** * hash_for_each_possible - iterate over all possible objects hashing to the * same bucket * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible(name, obj, member, key) \ hlist_for_each_entry(obj, &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_rcu - iterate over all possible objects hashing to the * same bucket in an rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_rcu(name, obj, member, key, cond...) \ hlist_for_each_entry_rcu(obj, &name[hash_min(key, HASH_BITS(name))],\ member, ## cond) /** * hash_for_each_possible_rcu_notrace - iterate over all possible objects hashing * to the same bucket in an rcu enabled hashtable in a rcu enabled hashtable * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over * * This is the same as hash_for_each_possible_rcu() except that it does * not do any RCU debugging or tracing. 
*/ #define hash_for_each_possible_rcu_notrace(name, obj, member, key) \ hlist_for_each_entry_rcu_notrace(obj, \ &name[hash_min(key, HASH_BITS(name))], member) /** * hash_for_each_possible_safe - iterate over all possible objects hashing to the * same bucket safe against removals * @name: hashtable to iterate * @obj: the type * to use as a loop cursor for each entry * @tmp: a &struct hlist_node used for temporary storage * @member: the name of the hlist_node within the struct * @key: the key of the objects to iterate over */ #define hash_for_each_possible_safe(name, obj, tmp, member, key) \ hlist_for_each_entry_safe(obj, tmp,\ &name[hash_min(key, HASH_BITS(name))], member) #endif
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM tlb

#if !defined(_TRACE_TLB_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_TLB_H

#include <linux/mm_types.h>
#include <linux/tracepoint.h>

#define TLB_FLUSH_REASON						\
	EM(  TLB_FLUSH_ON_TASK_SWITCH,	"flush on task switch" )	\
	EM(  TLB_REMOTE_SHOOTDOWN,	"remote shootdown" )		\
	EM(  TLB_LOCAL_SHOOTDOWN,	"local shootdown" )		\
	EM(  TLB_LOCAL_MM_SHOOTDOWN,	"local mm shootdown" )		\
	EMe( TLB_REMOTE_SEND_IPI,	"remote ipi send" )

/*
 * First define the enums in TLB_FLUSH_REASON to be exported to userspace
 * via TRACE_DEFINE_ENUM().
 */
#undef EM
#undef EMe
#define EM(a,b)		TRACE_DEFINE_ENUM(a);
#define EMe(a,b)	TRACE_DEFINE_ENUM(a);

TLB_FLUSH_REASON

/*
 * Now redefine the EM() and EMe() macros to map the enums to the strings
 * that will be printed in the output.
 */
#undef EM
#undef EMe
#define EM(a,b)		{ a, b },
#define EMe(a,b)	{ a, b }

TRACE_EVENT(tlb_flush,

	TP_PROTO(int reason, unsigned long pages),
	TP_ARGS(reason, pages),

	TP_STRUCT__entry(
		__field(	  int, reason)
		__field(unsigned long,  pages)
	),

	TP_fast_assign(
		__entry->reason = reason;
		__entry->pages  = pages;
	),

	TP_printk("pages:%ld reason:%s (%d)",
		__entry->pages,
		__print_symbolic(__entry->reason, TLB_FLUSH_REASON),
		__entry->reason)
);

#endif /* _TRACE_TLB_H */

/* This part must be outside protection */
#include <trace/define_trace.h>
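/*
 * Illustrative sketch of firing the tracepoint defined above from flush code.
 * demo_flush_local_tlb() and the page count are hypothetical; the generated
 * trace_tlb_flush() helper takes the reason enum and the number of pages.
 */
static void demo_flush_local_tlb(unsigned long nr_pages)
{
	/* ... architecture-specific flush would go here ... */
	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, nr_pages);
}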
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_SCHED_COREDUMP_H #define _LINUX_SCHED_COREDUMP_H #include <linux/mm_types.h> #define SUID_DUMP_DISABLE 0 /* No setuid dumping */ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ /* mm flags */ /* for SUID_DUMP_* above */ #define MMF_DUMPABLE_BITS 2 #define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. For things * that are using this for checking for privilege transitions, it must * test against SUID_DUMP_USER rather than treating it as a boolean * value. */ static inline int __get_dumpable(unsigned long mm_flags) { return mm_flags & MMF_DUMPABLE_MASK; } static inline int get_dumpable(struct mm_struct *mm) { return __get_dumpable(mm->flags); } /* coredump filter bits */ #define MMF_DUMP_ANON_PRIVATE 2 #define MMF_DUMP_ANON_SHARED 3 #define MMF_DUMP_MAPPED_PRIVATE 4 #define MMF_DUMP_MAPPED_SHARED 5 #define MMF_DUMP_ELF_HEADERS 6 #define MMF_DUMP_HUGETLB_PRIVATE 7 #define MMF_DUMP_HUGETLB_SHARED 8 #define MMF_DUMP_DAX_PRIVATE 9 #define MMF_DUMP_DAX_SHARED 10 #define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS #define MMF_DUMP_FILTER_BITS 9 #define MMF_DUMP_FILTER_MASK \ (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) #define MMF_DUMP_FILTER_DEFAULT \ ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) #ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS # define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) #else # define MMF_DUMP_MASK_DEFAULT_ELF 0 #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ /* * This one-shot flag is dropped due to necessity of changing exe once again * on NFS restore */ //#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_HAS_UPROBES 19 /* has uprobes */ #define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ #define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ #define MMF_OOM_VICTIM 25 /* mm is the oom victim */ #define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ #define MMF_MULTIPROCESS 27 /* mm is shared between processes */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ MMF_DISABLE_THP_MASK) #endif /* _LINUX_SCHED_COREDUMP_H */
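/*
 * Illustrative sketch of how the dumpable value and the MMF_* flag bits
 * defined above are typically consulted. demo_mm_is_dumpable() is a
 * hypothetical helper; note the comment above __get_dumpable() about testing
 * against SUID_DUMP_USER rather than treating the value as a boolean.
 */
static bool demo_mm_is_dumpable(struct mm_struct *mm)
{
	return get_dumpable(mm) == SUID_DUMP_USER &&
	       !test_bit(MMF_OOM_SKIP, &mm->flags);
}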
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 /* SPDX-License-Identifier: GPL-2.0-only */ /* * A policy database (policydb) specifies the * configuration data for the security policy. * * Author : Stephen Smalley, <sds@tycho.nsa.gov> */ /* * Updated: Trusted Computer Solutions, Inc. <dgoeddel@trustedcs.com> * * Support for enhanced MLS infrastructure. * * Updated: Frank Mayer <mayerf@tresys.com> and Karl MacMillan <kmacmillan@tresys.com> * * Added conditional policy language extensions * * Copyright (C) 2004-2005 Trusted Computer Solutions, Inc. * Copyright (C) 2003 - 2004 Tresys Technology, LLC */ #ifndef _SS_POLICYDB_H_ #define _SS_POLICYDB_H_ #include "symtab.h" #include "avtab.h" #include "sidtab.h" #include "ebitmap.h" #include "mls_types.h" #include "context.h" #include "constraint.h" /* * A datum type is defined for each kind of symbol * in the configuration data: individual permissions, * common prefixes for access vectors, classes, * users, roles, types, sensitivities, categories, etc. 
 */

/* Permission attributes */
struct perm_datum {
	u32 value;			/* permission bit + 1 */
};

/* Attributes of a common prefix for access vectors */
struct common_datum {
	u32 value;			/* internal common value */
	struct symtab permissions;	/* common permissions */
};

/* Class attributes */
struct class_datum {
	u32 value;			/* class value */
	char *comkey;			/* common name */
	struct common_datum *comdatum;	/* common datum */
	struct symtab permissions;	/* class-specific permission symbol table */
	struct constraint_node *constraints;	/* constraints on class permissions */
	struct constraint_node *validatetrans;	/* special transition rules */
/* Options how a new object user, role, and type should be decided */
#define DEFAULT_SOURCE		1
#define DEFAULT_TARGET		2
	char default_user;
	char default_role;
	char default_type;
/* Options how a new object range should be decided */
#define DEFAULT_SOURCE_LOW	1
#define DEFAULT_SOURCE_HIGH	2
#define DEFAULT_SOURCE_LOW_HIGH	3
#define DEFAULT_TARGET_LOW	4
#define DEFAULT_TARGET_HIGH	5
#define DEFAULT_TARGET_LOW_HIGH	6
#define DEFAULT_GLBLUB		7
	char default_range;
};

/* Role attributes */
struct role_datum {
	u32 value;			/* internal role value */
	u32 bounds;			/* boundary of role */
	struct ebitmap dominates;	/* set of roles dominated by this role */
	struct ebitmap types;		/* set of authorized types for role */
};

struct role_trans_key {
	u32 role;		/* current role */
	u32 type;		/* program executable type, or new object type */
	u32 tclass;		/* process class, or new object class */
};

struct role_trans_datum {
	u32 new_role;		/* new role */
};

struct filename_trans_key {
	u32 ttype;		/* parent dir context */
	u16 tclass;		/* class of new object */
	const char *name;	/* last path component */
};

struct filename_trans_datum {
	struct ebitmap stypes;			/* bitmap of source types for this otype */
	u32 otype;				/* resulting type of new object */
	struct filename_trans_datum *next;	/* record for next otype */
};

struct role_allow {
	u32 role;		/* current role */
	u32 new_role;		/* new role */
	struct role_allow *next;
};

/* Type attributes */
struct type_datum {
	u32 value;		/* internal type value */
	u32 bounds;		/* boundary of type */
	unsigned char primary;	/* primary name? */
	unsigned char attribute;/* attribute? */
};

/* User attributes */
struct user_datum {
	u32 value;			/* internal user value */
	u32 bounds;			/* bounds of user */
	struct ebitmap roles;		/* set of authorized roles for user */
	struct mls_range range;		/* MLS range (min - max) for user */
	struct mls_level dfltlevel;	/* default login MLS level for user */
};

/* Sensitivity attributes */
struct level_datum {
	struct mls_level *level;	/* sensitivity and associated categories */
	unsigned char isalias;		/* is this sensitivity an alias for another? */
};

/* Category attributes */
struct cat_datum {
	u32 value;		/* internal category bit + 1 */
	unsigned char isalias;	/* is this category an alias for another? */
};

struct range_trans {
	u32 source_type;
	u32 target_type;
	u32 target_class;
};

/* Boolean data type */
struct cond_bool_datum {
	__u32 value;		/* internal type value */
	int state;
};

struct cond_node;

/*
 * type set preserves data needed to determine constraint info from
 * policy source. This is not used by the kernel policy but allows
 * utilities such as audit2allow to determine constraint denials.
 */
struct type_set {
	struct ebitmap types;
	struct ebitmap negset;
	u32 flags;
};

/*
 * The configuration data includes security contexts for
 * initial SIDs, unlabeled file systems, TCP and UDP port numbers,
 * network interfaces, and nodes.
 * This structure stores the
 * relevant data for one such entry.  Entries of the same kind
 * (e.g. all initial SIDs) are linked together into a list.
 */
struct ocontext {
	union {
		char *name;	/* name of initial SID, fs, netif, fstype, path */
		struct {
			u8 protocol;
			u16 low_port;
			u16 high_port;
		} port;		/* TCP or UDP port information */
		struct {
			u32 addr;
			u32 mask;
		} node;		/* node information */
		struct {
			u32 addr[4];
			u32 mask[4];
		} node6;	/* IPv6 node information */
		struct {
			u64 subnet_prefix;
			u16 low_pkey;
			u16 high_pkey;
		} ibpkey;
		struct {
			char *dev_name;
			u8 port;
		} ibendport;
	} u;
	union {
		u32 sclass;	/* security class for genfs */
		u32 behavior;	/* labeling behavior for fs_use */
	} v;
	struct context context[2];	/* security context(s) */
	u32 sid[2];			/* SID(s) */
	struct ocontext *next;
};

struct genfs {
	char *fstype;
	struct ocontext *head;
	struct genfs *next;
};

/* symbol table array indices */
#define SYM_COMMONS	0
#define SYM_CLASSES	1
#define SYM_ROLES	2
#define SYM_TYPES	3
#define SYM_USERS	4
#define SYM_BOOLS	5
#define SYM_LEVELS	6
#define SYM_CATS	7
#define SYM_NUM		8

/* object context array indices */
#define OCON_ISID	0	/* initial SIDs */
#define OCON_FS		1	/* unlabeled file systems */
#define OCON_PORT	2	/* TCP and UDP port numbers */
#define OCON_NETIF	3	/* network interfaces */
#define OCON_NODE	4	/* nodes */
#define OCON_FSUSE	5	/* fs_use */
#define OCON_NODE6	6	/* IPv6 nodes */
#define OCON_IBPKEY	7	/* Infiniband PKeys */
#define OCON_IBENDPORT	8	/* Infiniband end ports */
#define OCON_NUM	9

/* The policy database */
struct policydb {
	int mls_enabled;

	/* symbol tables */
	struct symtab symtab[SYM_NUM];
#define p_commons symtab[SYM_COMMONS]
#define p_classes symtab[SYM_CLASSES]
#define p_roles symtab[SYM_ROLES]
#define p_types symtab[SYM_TYPES]
#define p_users symtab[SYM_USERS]
#define p_bools symtab[SYM_BOOLS]
#define p_levels symtab[SYM_LEVELS]
#define p_cats symtab[SYM_CATS]

	/* symbol names indexed by (value - 1) */
	char **sym_val_to_name[SYM_NUM];

	/* class, role, and user attributes indexed by (value - 1) */
	struct class_datum **class_val_to_struct;
	struct role_datum **role_val_to_struct;
	struct user_datum **user_val_to_struct;
	struct type_datum **type_val_to_struct;

	/* type enforcement access vectors and transitions */
	struct avtab te_avtab;

	/* role transitions */
	struct hashtab role_tr;

	/* file transitions with the last path component */
	/* quickly exclude lookups when parent ttype has no rules */
	struct ebitmap filename_trans_ttypes;
	/* actual set of filename_trans rules */
	struct hashtab filename_trans;
	/* only used if policyvers < POLICYDB_VERSION_COMP_FTRANS */
	u32 compat_filename_trans_count;

	/* bools indexed by (value - 1) */
	struct cond_bool_datum **bool_val_to_struct;
	/* type enforcement conditional access vectors and transitions */
	struct avtab te_cond_avtab;
	/* array indexing te_cond_avtab by conditional */
	struct cond_node *cond_list;
	u32 cond_list_len;

	/* role allows */
	struct role_allow *role_allow;

	/* security contexts of initial SIDs, unlabeled file systems,
	   TCP or UDP port numbers, network interfaces and nodes */
	struct ocontext *ocontexts[OCON_NUM];

	/* security contexts for files in filesystems that cannot support
	   a persistent label mapping or use another
	   fixed labeling behavior.
	 */
	struct genfs *genfs;

	/* range transitions table (range_trans_key -> mls_range) */
	struct hashtab range_tr;

	/* type -> attribute reverse mapping */
	struct ebitmap *type_attr_map_array;

	struct ebitmap policycaps;

	struct ebitmap permissive_map;

	/* length of this policy when it was loaded */
	size_t len;

	unsigned int policyvers;

	unsigned int reject_unknown : 1;
	unsigned int allow_unknown : 1;

	u16 process_class;
	u32 process_trans_perms;
} __randomize_layout;

extern void policydb_destroy(struct policydb *p);
extern int policydb_load_isids(struct policydb *p, struct sidtab *s);
extern int policydb_context_isvalid(struct policydb *p, struct context *c);
extern int policydb_class_isvalid(struct policydb *p, unsigned int class);
extern int policydb_type_isvalid(struct policydb *p, unsigned int type);
extern int policydb_role_isvalid(struct policydb *p, unsigned int role);
extern int policydb_read(struct policydb *p, void *fp);
extern int policydb_write(struct policydb *p, void *fp);

extern struct filename_trans_datum *policydb_filenametr_search(
	struct policydb *p, struct filename_trans_key *key);

extern struct mls_range *policydb_rangetr_search(
	struct policydb *p, struct range_trans *key);

extern struct role_trans_datum *policydb_roletr_search(
	struct policydb *p, struct role_trans_key *key);

#define POLICYDB_CONFIG_MLS	1

/* the config flags related to unknown classes/perms are bits 2 and 3 */
#define REJECT_UNKNOWN	0x00000002
#define ALLOW_UNKNOWN	0x00000004

#define OBJECT_R "object_r"
#define OBJECT_R_VAL 1

#define POLICYDB_MAGIC SELINUX_MAGIC
#define POLICYDB_STRING "SE Linux"

struct policy_file {
	char *data;
	size_t len;
};

struct policy_data {
	struct policydb *p;
	void *fp;
};

static inline int next_entry(void *buf, struct policy_file *fp, size_t bytes)
{
	if (bytes > fp->len)
		return -EINVAL;

	memcpy(buf, fp->data, bytes);
	fp->data += bytes;
	fp->len -= bytes;
	return 0;
}

static inline int put_entry(const void *buf, size_t bytes, int num,
			    struct policy_file *fp)
{
	size_t len = bytes * num;

	memcpy(fp->data, buf, len);
	fp->data += len;
	fp->len -= len;
	return 0;
}

static inline char *sym_name(struct policydb *p,
			     unsigned int sym_num, unsigned int element_nr)
{
	return p->sym_val_to_name[sym_num][element_nr];
}

extern u16 string_to_security_class(struct policydb *p, const char *name);
extern u32 string_to_av_perm(struct policydb *p, u16 tclass, const char *name);

#endif	/* _SS_POLICYDB_H_ */
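/*
 * Editorial note: a minimal sketch, not the actual policydb_read()
 * implementation. It only shows how next_entry() is meant to be used when
 * consuming an in-memory policy image: read two little-endian words, check
 * the magic number, and hand back the second word (which, in the on-disk
 * format, holds the length of the "SE Linux" identifier string).
 */
static inline int example_read_policy_header(struct policy_file *fp)
{
	__le32 buf[2];
	int rc;

	rc = next_entry(buf, fp, sizeof(buf));
	if (rc)
		return rc;

	if (le32_to_cpu(buf[0]) != POLICYDB_MAGIC)
		return -EINVAL;

	return le32_to_cpu(buf[1]);	/* length of the policydb string */
}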
/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * An interface between IEEE802.15.4 device and rest of the kernel.
 *
 * Copyright (C) 2007-2012 Siemens AG
 *
 * Written by:
 * Pavel Smolenskiy <pavel.smolenskiy@gmail.com>
 * Maxim Gorbachyov <maxim.gorbachev@siemens.com>
 * Maxim Osipov <maxim.osipov@siemens.com>
 * Dmitry Eremin-Solenikov <dbaryshkov@gmail.com>
 * Alexander Smirnov <alex.bluesman.smirnov@gmail.com>
 */

#ifndef IEEE802154_NETDEVICE_H
#define IEEE802154_NETDEVICE_H

#include <net/af_ieee802154.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/ieee802154.h>

#include <net/cfg802154.h>

struct ieee802154_sechdr {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	u8 level:3,
	   key_id_mode:2,
	   reserved:3;
#elif defined(__BIG_ENDIAN_BITFIELD)
	u8 reserved:3,
	   key_id_mode:2,
	   level:3;
#else
#error	"Please fix <asm/byteorder.h>"
#endif
	u8 key_id;
	__le32 frame_counter;
	union {
		__le32 short_src;
		__le64 extended_src;
	};
};

struct ieee802154_hdr_fc {
#if defined(__LITTLE_ENDIAN_BITFIELD)
	u16 type:3,
	    security_enabled:1,
	    frame_pending:1,
	    ack_request:1,
	    intra_pan:1,
	    reserved:3,
	    dest_addr_mode:2,
	    version:2,
	    source_addr_mode:2;
#elif defined(__BIG_ENDIAN_BITFIELD)
	u16 reserved:1,
	    intra_pan:1,
	    ack_request:1,
	    frame_pending:1,
	    security_enabled:1,
	    type:3,
	    source_addr_mode:2,
	    version:2,
	    dest_addr_mode:2,
	    reserved2:2;
#else
#error	"Please fix <asm/byteorder.h>"
#endif
};

struct ieee802154_hdr {
	struct ieee802154_hdr_fc fc;
	u8 seq;
	struct ieee802154_addr source;
	struct ieee802154_addr dest;
	struct ieee802154_sechdr sec;
};

/* pushes hdr onto the skb. fields of hdr->fc that can be calculated from
 * the contents of hdr will be, and the actual value of those bits in
 * hdr->fc will be ignored. this includes the INTRA_PAN bit and the frame
 * version, if SECEN is set.
 */
int ieee802154_hdr_push(struct sk_buff *skb, struct ieee802154_hdr *hdr);

/* pulls the entire 802.15.4 header off of the skb, including the security
 * header, and performs pan id decompression
 */
int ieee802154_hdr_pull(struct sk_buff *skb, struct ieee802154_hdr *hdr);

/* parses the frame control, sequence number and address fields in a given skb
 * and stores them into hdr, performing pan id decompression and length checks
 * to be suitable for use in header_ops.parse
 */
int ieee802154_hdr_peek_addrs(const struct sk_buff *skb,
			      struct ieee802154_hdr *hdr);

/* parses the full 802.15.4 header of a given skb and stores it into hdr,
 * performing pan id decompression and length checks to be suitable for use in
 * header_ops.parse
 */
int ieee802154_hdr_peek(const struct sk_buff *skb, struct ieee802154_hdr *hdr);

int ieee802154_max_payload(const struct ieee802154_hdr *hdr);

static inline int
ieee802154_sechdr_authtag_len(const struct ieee802154_sechdr *sec)
{
	switch (sec->level) {
	case IEEE802154_SCF_SECLEVEL_MIC32:
	case IEEE802154_SCF_SECLEVEL_ENC_MIC32:
		return 4;
	case IEEE802154_SCF_SECLEVEL_MIC64:
	case IEEE802154_SCF_SECLEVEL_ENC_MIC64:
		return 8;
	case IEEE802154_SCF_SECLEVEL_MIC128:
	case IEEE802154_SCF_SECLEVEL_ENC_MIC128:
		return 16;
	case IEEE802154_SCF_SECLEVEL_NONE:
	case IEEE802154_SCF_SECLEVEL_ENC:
	default:
		return 0;
	}
}

static inline int ieee802154_hdr_length(struct sk_buff *skb)
{
	struct ieee802154_hdr hdr;
	int len = ieee802154_hdr_pull(skb, &hdr);

	if (len > 0)
		skb_push(skb, len);

	return len;
}

static inline bool ieee802154_addr_equal(const struct ieee802154_addr *a1,
					 const struct ieee802154_addr *a2)
{
	if (a1->pan_id != a2->pan_id || a1->mode != a2->mode)
		return false;

	if ((a1->mode == IEEE802154_ADDR_LONG &&
	     a1->extended_addr != a2->extended_addr) ||
	    (a1->mode == IEEE802154_ADDR_SHORT &&
	     a1->short_addr != a2->short_addr))
		return false;

	return true;
}

static inline __le64 ieee802154_devaddr_from_raw(const void *raw)
{
	u64 temp;

	memcpy(&temp, raw, IEEE802154_ADDR_LEN);
	return (__force __le64)swab64(temp);
}

static inline void ieee802154_devaddr_to_raw(void *raw, __le64 addr)
{
	u64 temp = swab64((__force u64)addr);

	memcpy(raw, &temp, IEEE802154_ADDR_LEN);
}

static inline void ieee802154_addr_from_sa(struct ieee802154_addr *a,
					   const struct ieee802154_addr_sa *sa)
{
	a->mode = sa->addr_type;
	a->pan_id = cpu_to_le16(sa->pan_id);

	switch (a->mode) {
	case IEEE802154_ADDR_SHORT:
		a->short_addr = cpu_to_le16(sa->short_addr);
		break;
	case IEEE802154_ADDR_LONG:
		a->extended_addr = ieee802154_devaddr_from_raw(sa->hwaddr);
		break;
	}
}

static inline void ieee802154_addr_to_sa(struct ieee802154_addr_sa *sa,
					 const struct ieee802154_addr *a)
{
	sa->addr_type = a->mode;
	sa->pan_id = le16_to_cpu(a->pan_id);

	switch (a->mode) {
	case IEEE802154_ADDR_SHORT:
		sa->short_addr = le16_to_cpu(a->short_addr);
		break;
	case IEEE802154_ADDR_LONG:
		ieee802154_devaddr_to_raw(sa->hwaddr, a->extended_addr);
		break;
	}
}
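/*
 * Editorial note: a small illustrative helper, not part of the original
 * header. It only demonstrates that ieee802154_devaddr_from_raw() and
 * ieee802154_devaddr_to_raw() are inverses: converting a raw 8-byte
 * extended address to __le64 and back reproduces the original buffer.
 */
static inline bool example_devaddr_roundtrip(const u8 *raw)
{
	u8 out[IEEE802154_ADDR_LEN];
	__le64 addr = ieee802154_devaddr_from_raw(raw);

	ieee802154_devaddr_to_raw(out, addr);
	return memcmp(raw, out, IEEE802154_ADDR_LEN) == 0;
}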
/*
 * A control block of skb passed between the ARPHRD_IEEE802154 device
 * and other stack parts.
 */
struct ieee802154_mac_cb {
	u8 lqi;
	u8 type;
	bool ackreq;
	bool secen;
	bool secen_override;
	u8 seclevel;
	bool seclevel_override;
	struct ieee802154_addr source;
	struct ieee802154_addr dest;
};

static inline struct ieee802154_mac_cb *mac_cb(struct sk_buff *skb)
{
	return (struct ieee802154_mac_cb *)skb->cb;
}

static inline struct ieee802154_mac_cb *mac_cb_init(struct sk_buff *skb)
{
	BUILD_BUG_ON(sizeof(struct ieee802154_mac_cb) > sizeof(skb->cb));

	memset(skb->cb, 0, sizeof(struct ieee802154_mac_cb));
	return mac_cb(skb);
}

enum {
	IEEE802154_LLSEC_DEVKEY_IGNORE,
	IEEE802154_LLSEC_DEVKEY_RESTRICT,
	IEEE802154_LLSEC_DEVKEY_RECORD,

	__IEEE802154_LLSEC_DEVKEY_MAX,
};

#define IEEE802154_MAC_SCAN_ED		0
#define IEEE802154_MAC_SCAN_ACTIVE	1
#define IEEE802154_MAC_SCAN_PASSIVE	2
#define IEEE802154_MAC_SCAN_ORPHAN	3

struct ieee802154_mac_params {
	s8 transmit_power;
	u8 min_be;
	u8 max_be;
	u8 csma_retries;
	s8 frame_retries;

	bool lbt;
	struct wpan_phy_cca cca;
	s32 cca_ed_level;
};

struct wpan_phy;

enum {
	IEEE802154_LLSEC_PARAM_ENABLED		= BIT(0),
	IEEE802154_LLSEC_PARAM_FRAME_COUNTER	= BIT(1),
	IEEE802154_LLSEC_PARAM_OUT_LEVEL	= BIT(2),
	IEEE802154_LLSEC_PARAM_OUT_KEY		= BIT(3),
	IEEE802154_LLSEC_PARAM_KEY_SOURCE	= BIT(4),
	IEEE802154_LLSEC_PARAM_PAN_ID		= BIT(5),
	IEEE802154_LLSEC_PARAM_HWADDR		= BIT(6),
	IEEE802154_LLSEC_PARAM_COORD_HWADDR	= BIT(7),
	IEEE802154_LLSEC_PARAM_COORD_SHORTADDR	= BIT(8),
};

struct ieee802154_llsec_ops {
	int (*get_params)(struct net_device *dev,
			  struct ieee802154_llsec_params *params);
	int (*set_params)(struct net_device *dev,
			  const struct ieee802154_llsec_params *params,
			  int changed);

	int (*add_key)(struct net_device *dev,
		       const struct ieee802154_llsec_key_id *id,
		       const struct ieee802154_llsec_key *key);
	int (*del_key)(struct net_device *dev,
		       const struct ieee802154_llsec_key_id *id);

	int (*add_dev)(struct net_device *dev,
		       const struct ieee802154_llsec_device *llsec_dev);
	int (*del_dev)(struct net_device *dev, __le64 dev_addr);

	int (*add_devkey)(struct net_device *dev,
			  __le64 device_addr,
			  const struct ieee802154_llsec_device_key *key);
	int (*del_devkey)(struct net_device *dev,
			  __le64 device_addr,
			  const struct ieee802154_llsec_device_key *key);

	int (*add_seclevel)(struct net_device *dev,
			    const struct ieee802154_llsec_seclevel *sl);
	int (*del_seclevel)(struct net_device *dev,
			    const struct ieee802154_llsec_seclevel *sl);

	void (*lock_table)(struct net_device *dev);
	void (*get_table)(struct net_device *dev,
			  struct ieee802154_llsec_table **t);
	void (*unlock_table)(struct net_device *dev);
};

/*
 * This should be located at net_device->ml_priv
 *
 * get_phy should increment the reference counting on returned phy.
 * Use wpan_phy_put to put that reference.
 */
struct ieee802154_mlme_ops {
	/* The following fields are optional (can be NULL).
	 */
	int (*assoc_req)(struct net_device *dev,
			 struct ieee802154_addr *addr,
			 u8 channel, u8 page, u8 cap);
	int (*assoc_resp)(struct net_device *dev,
			  struct ieee802154_addr *addr,
			  __le16 short_addr, u8 status);
	int (*disassoc_req)(struct net_device *dev,
			    struct ieee802154_addr *addr,
			    u8 reason);
	int (*start_req)(struct net_device *dev,
			 struct ieee802154_addr *addr,
			 u8 channel, u8 page, u8 bcn_ord, u8 sf_ord,
			 u8 pan_coord, u8 blx, u8 coord_realign);
	int (*scan_req)(struct net_device *dev,
			u8 type, u32 channels, u8 page, u8 duration);

	int (*set_mac_params)(struct net_device *dev,
			      const struct ieee802154_mac_params *params);
	void (*get_mac_params)(struct net_device *dev,
			       struct ieee802154_mac_params *params);

	const struct ieee802154_llsec_ops *llsec;
};

static inline struct ieee802154_mlme_ops *
ieee802154_mlme_ops(const struct net_device *dev)
{
	return dev->ml_priv;
}

#endif
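/*
 * Editorial note: a hypothetical receive-path sketch, not part of the
 * original header. It shows how a MAC layer built on these helpers might
 * combine ieee802154_hdr_peek_addrs() with mac_cb_init() to mirror the
 * parsed addressing fields into the skb control block.
 */
static inline int example_parse_rx_frame(struct sk_buff *skb)
{
	struct ieee802154_hdr hdr;
	struct ieee802154_mac_cb *cb = mac_cb_init(skb);

	if (ieee802154_hdr_peek_addrs(skb, &hdr) < 0)
		return -EINVAL;

	cb->type = hdr.fc.type;
	cb->ackreq = hdr.fc.ack_request;
	cb->source = hdr.source;
	cb->dest = hdr.dest;

	return 0;
}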
/* SPDX-License-Identifier: GPL-2.0 */
#undef TRACE_SYSTEM
#define TRACE_SYSTEM power

#if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ)
#define _TRACE_POWER_H

#include <linux/cpufreq.h>
#include <linux/ktime.h>
#include <linux/pm_qos.h>
#include <linux/tracepoint.h>
#include <linux/trace_events.h>

#define TPS(x)  tracepoint_string(x)

DECLARE_EVENT_CLASS(cpu,

	TP_PROTO(unsigned int state, unsigned int cpu_id),

	TP_ARGS(state, cpu_id),

	TP_STRUCT__entry(
		__field(u32, state)
		__field(u32, cpu_id)
	),

	TP_fast_assign(
		__entry->state = state;
		__entry->cpu_id = cpu_id;
	),

	TP_printk("state=%lu cpu_id=%lu", (unsigned long)__entry->state,
		  (unsigned long)__entry->cpu_id)
);

DEFINE_EVENT(cpu, cpu_idle,

	TP_PROTO(unsigned int state, unsigned int cpu_id),

	TP_ARGS(state, cpu_id)
);

TRACE_EVENT(powernv_throttle,

	TP_PROTO(int chip_id, const char *reason, int pmax),

	TP_ARGS(chip_id, reason, pmax),

	TP_STRUCT__entry(
		__field(int, chip_id)
		__string(reason, reason)
		__field(int, pmax)
	),

	TP_fast_assign(
		__entry->chip_id = chip_id;
		__assign_str(reason, reason);
		__entry->pmax = pmax;
	),

	TP_printk("Chip %d Pmax %d %s", __entry->chip_id,
		  __entry->pmax, __get_str(reason))
);

TRACE_EVENT(pstate_sample,

	TP_PROTO(u32 core_busy,
		u32 scaled_busy,
		u32 from,
		u32 to,
		u64 mperf,
		u64 aperf,
		u64 tsc,
		u32 freq,
		u32 io_boost
		),

	TP_ARGS(core_busy,
		scaled_busy,
		from,
		to,
		mperf,
		aperf,
		tsc,
		freq,
		io_boost
		),

	TP_STRUCT__entry(
		__field(u32, core_busy)
		__field(u32, scaled_busy)
		__field(u32, from)
		__field(u32, to)
		__field(u64, mperf)
		__field(u64, aperf)
		__field(u64, tsc)
		__field(u32, freq)
		__field(u32, io_boost)
		),

	TP_fast_assign(
		__entry->core_busy = core_busy;
		__entry->scaled_busy = scaled_busy;
		__entry->from = from;
		__entry->to = to;
		__entry->mperf = mperf;
		__entry->aperf = aperf;
		__entry->tsc = tsc;
		__entry->freq = freq;
		__entry->io_boost = io_boost;
		),

	TP_printk("core_busy=%lu scaled=%lu from=%lu to=%lu mperf=%llu aperf=%llu tsc=%llu freq=%lu io_boost=%lu",
		(unsigned long)__entry->core_busy,
		(unsigned long)__entry->scaled_busy,
		(unsigned long)__entry->from,
		(unsigned long)__entry->to,
		(unsigned long long)__entry->mperf,
		(unsigned long long)__entry->aperf,
		(unsigned long long)__entry->tsc,