1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __LINUX_PREEMPT_H #define __LINUX_PREEMPT_H /* * include/linux/preempt.h - macros for accessing and manipulating * preempt_count (used for kernel preemption, interrupt count, etc.) */ #include <linux/linkage.h> #include <linux/list.h> /* * We put the hardirq and softirq counter into the preemption * counter. The bitmask has the following meaning: * * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with * interrupts disabled, so we cannot have nesting interrupts. Though * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_MASK: 0x000f0000 * NMI_MASK: 0x00f00000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_BITS 4 #define NMI_BITS 4 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) #define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) #define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) #define __IRQ_MASK(x) ((1UL << (x))-1) #define PREEMPT_MASK (__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT) #define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT) #define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT) #define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT) #define PREEMPT_OFFSET (1UL << PREEMPT_SHIFT) #define SOFTIRQ_OFFSET (1UL << SOFTIRQ_SHIFT) #define HARDIRQ_OFFSET (1UL << HARDIRQ_SHIFT) #define NMI_OFFSET (1UL << NMI_SHIFT) #define SOFTIRQ_DISABLE_OFFSET (2 * SOFTIRQ_OFFSET) #define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* * Disable preemption until the scheduler is running -- use an unconditional * value so that it also works on !PREEMPT_COUNT kernels. * * Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count(). */ #define INIT_PREEMPT_COUNT PREEMPT_OFFSET /* * Initial preempt_count value; reflects the preempt_count schedule invariant * which states that during context switches: * * preempt_count() == 2*PREEMPT_DISABLE_OFFSET * * Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels. * Note: See finish_task_switch(). */ #define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED) /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #define softirq_count() (preempt_count() & SOFTIRQ_MASK) #define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ | NMI_MASK)) /* * Are we doing bottom half or hardware interrupt processing? * * in_irq() - We're in (hard) IRQ context * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled * in_serving_softirq() - We're in softirq context * in_nmi() - We're in NMI context * in_task() - We're in task context * * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really * should not be used in new code. */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) #define in_nmi() (preempt_count() & NMI_MASK) #define in_task() (!(preempt_count() & \ (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) /* * The preempt_count offset after preempt_disable(); */ #if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET #else # define PREEMPT_DISABLE_OFFSET 0 #endif /* * The preempt_count offset after spin_lock() */ #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET /* * The preempt_count offset needed for things like: * * spin_lock_bh() * * Which need to disable both preemption (CONFIG_PREEMPT_COUNT) and * softirqs, such that unlock sequences of: * * spin_unlock(); * local_bh_enable(); * * Work as expected. */ #define SOFTIRQ_LOCK_OFFSET (SOFTIRQ_DISABLE_OFFSET + PREEMPT_LOCK_OFFSET) /* * Are we running in atomic context? WARNING: this macro cannot * always detect atomic context; in particular, it cannot know about * held spinlocks in non-preemptible kernels. Thus it should not be * used in the general case to determine whether sleeping is possible. * Do not use in_atomic() in driver code. */ #define in_atomic() (preempt_count() != 0) /* * Check whether we were atomic before we did preempt_disable(): * (used by the scheduler) */ #define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET) #if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_TRACE_PREEMPT_TOGGLE) extern void preempt_count_add(int val); extern void preempt_count_sub(int val); #define preempt_count_dec_and_test() \ ({ preempt_count_sub(1); should_resched(0); }) #else #define preempt_count_add(val) __preempt_count_add(val) #define preempt_count_sub(val) __preempt_count_sub(val) #define preempt_count_dec_and_test() __preempt_count_dec_and_test() #endif #define __preempt_count_inc() __preempt_count_add(1) #define __preempt_count_dec() __preempt_count_sub(1) #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ do { \ preempt_count_inc(); \ barrier(); \ } while (0) #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_no_resched() sched_preempt_enable_no_resched() #define preemptible() (preempt_count() == 0 && !irqs_disabled()) #ifdef CONFIG_PREEMPTION #define preempt_enable() \ do { \ barrier(); \ if (unlikely(preempt_count_dec_and_test())) \ __preempt_schedule(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ if (unlikely(__preempt_count_dec_and_test())) \ __preempt_schedule_notrace(); \ } while (0) #define preempt_check_resched() \ do { \ if (should_resched(0)) \ __preempt_schedule(); \ } while (0) #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) #define preempt_enable_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #define preempt_check_resched() do { } while (0) #endif /* CONFIG_PREEMPTION */ #define preempt_disable_notrace() \ do { \ __preempt_count_inc(); \ barrier(); \ } while (0) #define preempt_enable_no_resched_notrace() \ do { \ barrier(); \ __preempt_count_dec(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ /* * Even if we don't have any preemption, we need preempt disable/enable * to be barriers, so that we don't have things like get_user/put_user * that can cause faults and scheduling migrate into our preempt-protected * region. */ #define preempt_disable() barrier() #define sched_preempt_enable_no_resched() barrier() #define preempt_enable_no_resched() barrier() #define preempt_enable() barrier() #define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() #define preemptible() 0 #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE /* * Modules have no business playing preemption tricks. */ #undef sched_preempt_enable_no_resched #undef preempt_enable_no_resched #undef preempt_enable_no_resched_notrace #undef preempt_check_resched #endif #define preempt_set_need_resched() \ do { \ set_preempt_need_resched(); \ } while (0) #define preempt_fold_need_resched() \ do { \ if (tif_need_resched()) \ set_preempt_need_resched(); \ } while (0) #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; /** * preempt_ops - notifiers called when a task is preempted and rescheduled * @sched_in: we're about to be rescheduled: * notifier: struct preempt_notifier for the task being scheduled * cpu: cpu we're scheduled on * @sched_out: we've just been preempted * notifier: struct preempt_notifier for the task being preempted * next: the task that's kicking us out * * Please note that sched_in and out are called under different * contexts. sched_out is called with rq lock held and irq disabled * while sched_in is called without rq lock and irq enabled. This * difference is intentional and depended upon by its users. */ struct preempt_ops { void (*sched_in)(struct preempt_notifier *notifier, int cpu); void (*sched_out)(struct preempt_notifier *notifier, struct task_struct *next); }; /** * preempt_notifier - key for installing preemption notifiers * @link: internal use * @ops: defines the notifier functions to be called * * Usually used in conjunction with container_of(). */ struct preempt_notifier { struct hlist_node link; struct preempt_ops *ops; }; void preempt_notifier_inc(void); void preempt_notifier_dec(void); void preempt_notifier_register(struct preempt_notifier *notifier); void preempt_notifier_unregister(struct preempt_notifier *notifier); static inline void preempt_notifier_init(struct preempt_notifier *notifier, struct preempt_ops *ops) { INIT_HLIST_NODE(&notifier->link); notifier->ops = ops; } #endif /** * migrate_disable - Prevent migration of the current task * * Maps to preempt_disable() which also disables preemption. Use * migrate_disable() to annotate that the intent is to prevent migration, * but not necessarily preemption. * * Can be invoked nested like preempt_disable() and needs the corresponding * number of migrate_enable() invocations. */ static __always_inline void migrate_disable(void) { preempt_disable(); } /** * migrate_enable - Allow migration of the current task * * Counterpart to migrate_disable(). * * As migrate_disable() can be invoked nested, only the outermost invocation * reenables migration. * * Currently mapped to preempt_enable(). */ static __always_inline void migrate_enable(void) { preempt_enable(); } #endif /* __LINUX_PREEMPT_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 /* SPDX-License-Identifier: GPL-2.0-only */ /* * V9FS definitions. * * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com> * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> */ #ifndef FS_9P_V9FS_H #define FS_9P_V9FS_H #include <linux/backing-dev.h> /** * enum p9_session_flags - option flags for each 9P session * @V9FS_PROTO_2000U: whether or not to use 9P2000.u extensions * @V9FS_PROTO_2000L: whether or not to use 9P2000.l extensions * @V9FS_ACCESS_SINGLE: only the mounting user can access the hierarchy * @V9FS_ACCESS_USER: a new attach will be issued for every user (default) * @V9FS_ACCESS_CLIENT: Just like user, but access check is performed on client. * @V9FS_ACCESS_ANY: use a single attach for all users * @V9FS_ACCESS_MASK: bit mask of different ACCESS options * @V9FS_POSIX_ACL: POSIX ACLs are enforced * * Session flags reflect options selected by users at mount time */ #define V9FS_ACCESS_ANY (V9FS_ACCESS_SINGLE | \ V9FS_ACCESS_USER | \ V9FS_ACCESS_CLIENT) #define V9FS_ACCESS_MASK V9FS_ACCESS_ANY #define V9FS_ACL_MASK V9FS_POSIX_ACL enum p9_session_flags { V9FS_PROTO_2000U = 0x01, V9FS_PROTO_2000L = 0x02, V9FS_ACCESS_SINGLE = 0x04, V9FS_ACCESS_USER = 0x08, V9FS_ACCESS_CLIENT = 0x10, V9FS_POSIX_ACL = 0x20 }; /* possible values of ->cache */ /** * enum p9_cache_modes - user specified cache preferences * @CACHE_NONE: do not cache data, dentries, or directory contents (default) * @CACHE_LOOSE: cache data, dentries, and directory contents w/no consistency * * eventually support loose, tight, time, session, default always none */ enum p9_cache_modes { CACHE_NONE, CACHE_MMAP, CACHE_LOOSE, CACHE_FSCACHE, nr__p9_cache_modes }; /** * struct v9fs_session_info - per-instance session information * @flags: session options of type &p9_session_flags * @nodev: set to 1 to disable device mapping * @debug: debug level * @afid: authentication handle * @cache: cache mode of type &p9_cache_modes * @cachetag: the tag of the cache associated with this session * @fscache: session cookie associated with FS-Cache * @uname: string user name to mount hierarchy as * @aname: mount specifier for remote hierarchy * @maxdata: maximum data to be sent/recvd per protocol message * @dfltuid: default numeric userid to mount hierarchy as * @dfltgid: default numeric groupid to mount hierarchy as * @uid: if %V9FS_ACCESS_SINGLE, the numeric uid which mounted the hierarchy * @clnt: reference to 9P network client instantiated for this session * @slist: reference to list of registered 9p sessions * * This structure holds state for each session instance established during * a sys_mount() . * * Bugs: there seems to be a lot of state which could be condensed and/or * removed. */ struct v9fs_session_info { /* options */ unsigned char flags; unsigned char nodev; unsigned short debug; unsigned int afid; unsigned int cache; #ifdef CONFIG_9P_FSCACHE char *cachetag; struct fscache_cookie *fscache; #endif char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ kuid_t dfltuid; /* default uid/muid for legacy support */ kgid_t dfltgid; /* default gid for legacy support */ kuid_t uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ struct rw_semaphore rename_sem; long session_lock_timeout; /* retry interval for blocking locks */ }; /* cache_validity flags */ #define V9FS_INO_INVALID_ATTR 0x01 struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE struct mutex fscache_lock; struct fscache_cookie *fscache; #endif struct p9_qid qid; unsigned int cache_validity; struct p9_fid *writeback_fid; struct mutex v_mutex; struct inode vfs_inode; }; static inline struct v9fs_inode *V9FS_I(const struct inode *inode) { return container_of(inode, struct v9fs_inode, vfs_inode); } extern int v9fs_show_options(struct seq_file *m, struct dentry *root); struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, char *); extern void v9fs_session_close(struct v9fs_session_info *v9ses); extern void v9fs_session_cancel(struct v9fs_session_info *v9ses); extern void v9fs_session_begin_cancel(struct v9fs_session_info *v9ses); extern struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); extern const struct inode_operations v9fs_dir_inode_operations_dotl; extern const struct inode_operations v9fs_file_inode_operations_dotl; extern const struct inode_operations v9fs_symlink_inode_operations_dotl; extern struct inode *v9fs_inode_from_fid_dotl(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); /* other default globals */ #define V9FS_PORT 564 #define V9FS_DEFUSER "nobody" #define V9FS_DEFANAME "" #define V9FS_DEFUID KUIDT_INIT(-2) #define V9FS_DEFGID KGIDT_INIT(-2) static inline struct v9fs_session_info *v9fs_inode2v9ses(struct inode *inode) { return (inode->i_sb->s_fs_info); } static inline struct v9fs_session_info *v9fs_dentry2v9ses(struct dentry *dentry) { return dentry->d_sb->s_fs_info; } static inline int v9fs_proto_dotu(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000U; } static inline int v9fs_proto_dotl(struct v9fs_session_info *v9ses) { return v9ses->flags & V9FS_PROTO_2000L; } /** * v9fs_get_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 0); else return v9fs_inode_from_fid(v9ses, fid, sb, 0); } /** * v9fs_get_new_inode_from_fid - Helper routine to populate an inode by * issuing a attribute request * @v9ses: session information * @fid: fid to issue attribute request for * @sb: superblock on which to create inode * */ static inline struct inode * v9fs_get_new_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb) { if (v9fs_proto_dotl(v9ses)) return v9fs_inode_from_fid_dotl(v9ses, fid, sb, 1); else return v9fs_inode_from_fid(v9ses, fid, sb, 1); } #endif
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _LINUX_UNALIGNED_ACCESS_OK_H #define _LINUX_UNALIGNED_ACCESS_OK_H #include <linux/kernel.h> #include <asm/byteorder.h> static __always_inline u16 get_unaligned_le16(const void *p) { return le16_to_cpup((__le16 *)p); } static __always_inline u32 get_unaligned_le32(const void *p) { return le32_to_cpup((__le32 *)p); } static __always_inline u64 get_unaligned_le64(const void *p) { return le64_to_cpup((__le64 *)p); } static __always_inline u16 get_unaligned_be16(const void *p) { return be16_to_cpup((__be16 *)p); } static __always_inline u32 get_unaligned_be32(const void *p) { return be32_to_cpup((__be32 *)p); } static __always_inline u64 get_unaligned_be64(const void *p) { return be64_to_cpup((__be64 *)p); } static __always_inline void put_unaligned_le16(u16 val, void *p) { *((__le16 *)p) = cpu_to_le16(val); } static __always_inline void put_unaligned_le32(u32 val, void *p) { *((__le32 *)p) = cpu_to_le32(val); } static __always_inline void put_unaligned_le64(u64 val, void *p) { *((__le64 *)p) = cpu_to_le64(val); } static __always_inline void put_unaligned_be16(u16 val, void *p) { *((__be16 *)p) = cpu_to_be16(val); } static __always_inline void put_unaligned_be32(u32 val, void *p) { *((__be32 *)p) = cpu_to_be32(val); } static __always_inline void put_unaligned_be64(u64 val, void *p) { *((__be64 *)p) = cpu_to_be64(val); } #endif /* _LINUX_UNALIGNED_ACCESS_OK_H */
2 2 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 // SPDX-License-Identifier: GPL-2.0-only #include <linux/wait.h> #include <linux/rbtree.h> #include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> #include <linux/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> #include <trace/events/writeback.h> struct backing_dev_info noop_backing_dev_info; EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; static const char *bdi_unknown_name = "(unknown)"; /* * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); static u64 bdi_id_cursor; static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ struct workqueue_struct *bdi_wq; #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> static struct dentry *bdi_debug_root; static void bdi_debug_init(void) { bdi_debug_root = debugfs_create_dir("bdi", NULL); } static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long wb_thresh; unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time; struct inode *inode; nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); list_for_each_entry(inode, &wb->b_dirty, i_io_list) nr_dirty++; list_for_each_entry(inode, &wb->b_io, i_io_list) nr_io++; list_for_each_entry(inode, &wb->b_more_io, i_io_list) nr_more_io++; list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) nr_dirty_time++; spin_unlock(&wb->list_lock); global_dirty_limits(&background_thresh, &dirty_thresh); wb_thresh = wb_calc_thresh(wb, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, "BdiWriteback: %10lu kB\n" "BdiReclaimable: %10lu kB\n" "BdiDirtyThresh: %10lu kB\n" "DirtyThresh: %10lu kB\n" "BackgroundThresh: %10lu kB\n" "BdiDirtied: %10lu kB\n" "BdiWritten: %10lu kB\n" "BdiWriteBandwidth: %10lu kBps\n" "b_dirty: %10lu\n" "b_io: %10lu\n" "b_more_io: %10lu\n" "b_dirty_time: %10lu\n" "bdi_list: %10u\n" "state: %10lx\n", (unsigned long) K(wb_stat(wb, WB_WRITEBACK)), (unsigned long) K(wb_stat(wb, WB_RECLAIMABLE)), K(wb_thresh), K(dirty_thresh), K(background_thresh), (unsigned long) K(wb_stat(wb, WB_DIRTIED)), (unsigned long) K(wb_stat(wb, WB_WRITTEN)), (unsigned long) K(wb->write_bandwidth), nr_dirty, nr_io, nr_more_io, nr_dirty_time, !list_empty(&bdi->bdi_list), bdi->wb.state); #undef K return 0; } DEFINE_SHOW_ATTRIBUTE(bdi_debug_stats); static void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { bdi->debug_dir = debugfs_create_dir(name, bdi_debug_root); debugfs_create_file("stats", 0444, bdi->debug_dir, bdi, &bdi_debug_stats_fops); } static void bdi_debug_unregister(struct backing_dev_info *bdi) { debugfs_remove_recursive(bdi->debug_dir); } #else static inline void bdi_debug_init(void) { } static inline void bdi_debug_register(struct backing_dev_info *bdi, const char *name) { } static inline void bdi_debug_unregister(struct backing_dev_info *bdi) { } #endif static ssize_t read_ahead_kb_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned long read_ahead_kb; ssize_t ret; ret = kstrtoul(buf, 10, &read_ahead_kb); if (ret < 0) return ret; bdi->ra_pages = read_ahead_kb >> (PAGE_SHIFT - 10); return count; } #define K(pages) ((pages) << (PAGE_SHIFT - 10)) #define BDI_SHOW(name, expr) \ static ssize_t name##_show(struct device *dev, \ struct device_attribute *attr, char *page) \ { \ struct backing_dev_info *bdi = dev_get_drvdata(dev); \ \ return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \ } \ static DEVICE_ATTR_RW(name); BDI_SHOW(read_ahead_kb, K(bdi->ra_pages)) static ssize_t min_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_min_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(min_ratio, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { struct backing_dev_info *bdi = dev_get_drvdata(dev); unsigned int ratio; ssize_t ret; ret = kstrtouint(buf, 10, &ratio); if (ret < 0) return ret; ret = bdi_set_max_ratio(bdi, ratio); if (!ret) ret = count; return ret; } BDI_SHOW(max_ratio, bdi->max_ratio) static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, char *page) { dev_warn_once(dev, "the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n"); return snprintf(page, PAGE_SIZE-1, "%d\n", 0); } static DEVICE_ATTR_RO(stable_pages_required); static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, &dev_attr_max_ratio.attr, &dev_attr_stable_pages_required.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); static __init int bdi_class_init(void) { bdi_class = class_create(THIS_MODULE, "bdi"); if (IS_ERR(bdi_class)) return PTR_ERR(bdi_class); bdi_class->dev_groups = bdi_dev_groups; bdi_debug_init(); return 0; } postcore_initcall(bdi_class_init); static int bdi_init(struct backing_dev_info *bdi); static int __init default_bdi_init(void) { int err; bdi_wq = alloc_workqueue("writeback", WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_SYSFS, 0); if (!bdi_wq) return -ENOMEM; err = bdi_init(&noop_backing_dev_info); return err; } subsys_initcall(default_bdi_init); /* * This function is used when the first inode for this wb is marked dirty. It * wakes-up the corresponding bdi thread which should then take care of the * periodic background write-out of dirty inodes. Since the write-out would * starts only 'dirty_writeback_interval' centisecs from now anyway, we just * set up a timer which wakes the bdi thread up later. * * Note, we wouldn't bother setting up the timer, but this function is on the * fast-path (used by '__mark_inode_dirty()'), so we save few context switches * by delaying the wake-up. * * We have to be careful not to postpone flush work if it is scheduled for * earlier. Thus we use queue_delayed_work(). */ void wb_wakeup_delayed(struct bdi_writeback *wb) { unsigned long timeout; timeout = msecs_to_jiffies(dirty_writeback_interval * 10); spin_lock_bh(&wb->work_lock); if (test_bit(WB_registered, &wb->state)) queue_delayed_work(bdi_wq, &wb->dwork, timeout); spin_unlock_bh(&wb->work_lock); } /* * Initial write bandwidth: 100 MB/s */ #define INIT_BW (100 << (20 - PAGE_SHIFT)) static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, gfp_t gfp) { int i, err; memset(wb, 0, sizeof(*wb)); if (wb != &bdi->wb) bdi_get(bdi); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); INIT_LIST_HEAD(&wb->b_io); INIT_LIST_HEAD(&wb->b_more_io); INIT_LIST_HEAD(&wb->b_dirty_time); spin_lock_init(&wb->list_lock); wb->bw_time_stamp = jiffies; wb->balanced_dirty_ratelimit = INIT_BW; wb->dirty_ratelimit = INIT_BW; wb->write_bandwidth = INIT_BW; wb->avg_write_bandwidth = INIT_BW; spin_lock_init(&wb->work_lock); INIT_LIST_HEAD(&wb->work_list); INIT_DELAYED_WORK(&wb->dwork, wb_workfn); wb->dirty_sleep = jiffies; err = fprop_local_init_percpu(&wb->completions, gfp); if (err) goto out_put_bdi; for (i = 0; i < NR_WB_STAT_ITEMS; i++) { err = percpu_counter_init(&wb->stat[i], 0, gfp); if (err) goto out_destroy_stat; } return 0; out_destroy_stat: while (i--) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); out_put_bdi: if (wb != &bdi->wb) bdi_put(bdi); return err; } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); /* * Remove bdi from the global list and shutdown any threads we have running */ static void wb_shutdown(struct bdi_writeback *wb) { /* Make sure nobody queues further work */ spin_lock_bh(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_bh(&wb->work_lock); return; } spin_unlock_bh(&wb->work_lock); cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to * be drained no matter what. */ mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); } static void wb_exit(struct bdi_writeback *wb) { int i; WARN_ON(delayed_work_pending(&wb->dwork)); for (i = 0; i < NR_WB_STAT_ITEMS; i++) percpu_counter_destroy(&wb->stat[i]); fprop_local_destroy_percpu(&wb->completions); if (wb != &wb->bdi->wb) bdi_put(wb->bdi); } #ifdef CONFIG_CGROUP_WRITEBACK #include <linux/memcontrol.h> /* * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list. * bdi->cgwb_tree is also RCU protected. */ static DEFINE_SPINLOCK(cgwb_lock); static struct workqueue_struct *cgwb_release_wq; static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); struct blkcg *blkcg = css_to_blkcg(wb->blkcg_css); mutex_lock(&wb->bdi->cgwb_release_mutex); wb_shutdown(wb); css_put(wb->memcg_css); css_put(wb->blkcg_css); mutex_unlock(&wb->bdi->cgwb_release_mutex); /* triggers blkg destruction if no online users left */ blkcg_unpin_online(blkcg); fprop_local_destroy_percpu(&wb->memcg_completions); percpu_ref_exit(&wb->refcnt); wb_exit(wb); kfree_rcu(wb, rcu); } static void cgwb_release(struct percpu_ref *refcnt) { struct bdi_writeback *wb = container_of(refcnt, struct bdi_writeback, refcnt); queue_work(cgwb_release_wq, &wb->release_work); } static void cgwb_kill(struct bdi_writeback *wb) { lockdep_assert_held(&cgwb_lock); WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->memcg_css->id)); list_del(&wb->memcg_node); list_del(&wb->blkcg_node); percpu_ref_kill(&wb->refcnt); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { spin_lock_irq(&cgwb_lock); list_del_rcu(&wb->bdi_node); spin_unlock_irq(&cgwb_lock); } static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct mem_cgroup *memcg; struct cgroup_subsys_state *blkcg_css; struct blkcg *blkcg; struct list_head *memcg_cgwb_list, *blkcg_cgwb_list; struct bdi_writeback *wb; unsigned long flags; int ret = 0; memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; /* look up again under lock and discard on blkcg mismatch */ spin_lock_irqsave(&cgwb_lock, flags); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb && wb->blkcg_css != blkcg_css) { cgwb_kill(wb); wb = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); if (wb) goto out_put; /* need to create a new one */ wb = kmalloc(sizeof(*wb), gfp); if (!wb) { ret = -ENOMEM; goto out_put; } ret = wb_init(wb, bdi, gfp); if (ret) goto err_free; ret = percpu_ref_init(&wb->refcnt, cgwb_release, 0, gfp); if (ret) goto err_wb_exit; ret = fprop_local_init_percpu(&wb->memcg_completions, gfp); if (ret) goto err_ref_exit; wb->memcg_css = memcg_css; wb->blkcg_css = blkcg_css; INIT_WORK(&wb->release_work, cgwb_release_workfn); set_bit(WB_registered, &wb->state); /* * The root wb determines the registered state of the whole bdi and * memcg_cgwb_list and blkcg_cgwb_list's next pointers indicate * whether they're still online. Don't link @wb if any is dead. * See wb_memcg_offline() and wb_blkcg_offline(). */ ret = -ENODEV; spin_lock_irqsave(&cgwb_lock, flags); if (test_bit(WB_registered, &bdi->wb.state) && blkcg_cgwb_list->next && memcg_cgwb_list->next) { /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); blkcg_pin_online(blkcg); css_get(memcg_css); css_get(blkcg_css); } } spin_unlock_irqrestore(&cgwb_lock, flags); if (ret) { if (ret == -EEXIST) ret = 0; goto err_fprop_exit; } goto out_put; err_fprop_exit: fprop_local_destroy_percpu(&wb->memcg_completions); err_ref_exit: percpu_ref_exit(&wb->refcnt); err_wb_exit: wb_exit(wb); err_free: kfree(wb); out_put: css_put(blkcg_css); return ret; } /** * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * * Try to get the wb for @memcg_css on @bdi. The returned wb has its * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on * @memcg_css isn't enough. try_get it before calling this function. * * A wb is keyed by its associated memcg. As blkcg implicitly enables * memcg on the default hierarchy, memcg association is guaranteed to be * more specific (equal or descendant to the associated blkcg) and thus can * identify both the memcg and blkcg associations. * * Because the blkcg associated with a memcg may change as blkcg is enabled * and disabled closer to root in the hierarchy, each wb keeps track of * both the memcg and blkcg associated with it and verifies the blkcg on * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css) { struct bdi_writeback *wb; if (!memcg_css->parent) return &bdi->wb; rcu_read_lock(); wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); if (wb) { struct cgroup_subsys_state *blkcg_css; /* see whether the blkcg association has changed */ blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) wb = NULL; css_put(blkcg_css); } rcu_read_unlock(); return wb; } /** * wb_get_create - get wb for a given memcg, create if necessary * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) * @gfp: allocation mask to use * * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to * create one. See wb_get_lookup() for more details. */ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { struct bdi_writeback *wb; might_sleep_if(gfpflags_allow_blocking(gfp)); if (!memcg_css->parent) return &bdi->wb; do { wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; } static int cgwb_bdi_init(struct backing_dev_info *bdi) { int ret; INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); mutex_init(&bdi->cgwb_release_mutex); init_rwsem(&bdi->wb_switch_rwsem); ret = wb_init(&bdi->wb, bdi, GFP_KERNEL); if (!ret) { bdi->wb.memcg_css = &root_mem_cgroup->css; bdi->wb.blkcg_css = blkcg_root_css; } return ret; } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); spin_unlock_irq(&cgwb_lock); mutex_lock(&bdi->cgwb_release_mutex); spin_lock_irq(&cgwb_lock); while (!list_empty(&bdi->wb_list)) { wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, bdi_node); spin_unlock_irq(&cgwb_lock); wb_shutdown(wb); spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); mutex_unlock(&bdi->cgwb_release_mutex); } /** * wb_memcg_offline - kill all wb's associated with a memcg being offlined * @memcg: memcg being offlined * * Also prevents creation of any new wb's associated with @memcg. */ void wb_memcg_offline(struct mem_cgroup *memcg) { struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, memcg_cgwb_list, memcg_node) cgwb_kill(wb); memcg_cgwb_list->next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } /** * wb_blkcg_offline - kill all wb's associated with a blkcg being offlined * @blkcg: blkcg being offlined * * Also prevents creation of any new wb's associated with @blkcg. */ void wb_blkcg_offline(struct blkcg *blkcg) { struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node) cgwb_kill(wb); blkcg->cgwb_list.next = NULL; /* prevent new wb's */ spin_unlock_irq(&cgwb_lock); } static void cgwb_bdi_register(struct backing_dev_info *bdi) { spin_lock_irq(&cgwb_lock); list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); spin_unlock_irq(&cgwb_lock); } static int __init cgwb_init(void) { /* * There can be many concurrent release work items overwhelming * system_wq. Put them in a separate wq and limit concurrency. * There's no point in executing many of these in parallel. */ cgwb_release_wq = alloc_workqueue("cgwb_release", 0, 1); if (!cgwb_release_wq) return -ENOMEM; return 0; } subsys_initcall(cgwb_init); #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) { return wb_init(&bdi->wb, bdi, GFP_KERNEL); } static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_register(struct backing_dev_info *bdi) { list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); } static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) { list_del_rcu(&wb->bdi_node); } #endif /* CONFIG_CGROUP_WRITEBACK */ static int bdi_init(struct backing_dev_info *bdi) { int ret; bdi->dev = NULL; kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); init_waitqueue_head(&bdi->wb_waitq); ret = cgwb_bdi_init(bdi); return ret; } struct backing_dev_info *bdi_alloc(int node_id) { struct backing_dev_info *bdi; bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id); if (!bdi) return NULL; if (bdi_init(bdi)) { kfree(bdi); return NULL; } bdi->capabilities = BDI_CAP_WRITEBACK | BDI_CAP_WRITEBACK_ACCT; bdi->ra_pages = VM_READAHEAD_PAGES; bdi->io_pages = VM_READAHEAD_PAGES; return bdi; } EXPORT_SYMBOL(bdi_alloc); static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) { struct rb_node **p = &bdi_tree.rb_node; struct rb_node *parent = NULL; struct backing_dev_info *bdi; lockdep_assert_held(&bdi_lock); while (*p) { parent = *p; bdi = rb_entry(parent, struct backing_dev_info, rb_node); if (bdi->id > id) p = &(*p)->rb_left; else if (bdi->id < id) p = &(*p)->rb_right; else break; } if (parentp) *parentp = parent; return p; } /** * bdi_get_by_id - lookup and get bdi from its id * @id: bdi id to lookup * * Find bdi matching @id and get it. Returns NULL if the matching bdi * doesn't exist or is already unregistered. */ struct backing_dev_info *bdi_get_by_id(u64 id) { struct backing_dev_info *bdi = NULL; struct rb_node **p; spin_lock_bh(&bdi_lock); p = bdi_lookup_rb_node(id, NULL); if (*p) { bdi = rb_entry(*p, struct backing_dev_info, rb_node); bdi_get(bdi); } spin_unlock_bh(&bdi_lock); return bdi; } int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; vsnprintf(bdi->dev_name, sizeof(bdi->dev_name), fmt, args); dev = device_create(bdi_class, NULL, MKDEV(0, 0), bdi, bdi->dev_name); if (IS_ERR(dev)) return PTR_ERR(dev); cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); bdi->id = ++bdi_id_cursor; p = bdi_lookup_rb_node(bdi->id, &parent); rb_link_node(&bdi->rb_node, parent, p); rb_insert_color(&bdi->rb_node, &bdi_tree); list_add_tail_rcu(&bdi->bdi_list, &bdi_list); spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); return 0; } int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...) { va_list args; int ret; va_start(args, fmt); ret = bdi_register_va(bdi, fmt, args); va_end(args); return ret; } EXPORT_SYMBOL(bdi_register); void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner) { WARN_ON_ONCE(bdi->owner); bdi->owner = owner; get_device(owner); } /* * Remove bdi from bdi_list, and ensure that it is no longer visible */ static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); synchronize_rcu_expedited(); } void bdi_unregister(struct backing_dev_info *bdi) { /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); cgwb_bdi_unregister(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; } if (bdi->owner) { put_device(bdi->owner); bdi->owner = NULL; } } static void release_bdi(struct kref *ref) { struct backing_dev_info *bdi = container_of(ref, struct backing_dev_info, refcnt); if (test_bit(WB_registered, &bdi->wb.state)) bdi_unregister(bdi); WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); kfree(bdi); } void bdi_put(struct backing_dev_info *bdi) { kref_put(&bdi->refcnt, release_bdi); } EXPORT_SYMBOL(bdi_put); const char *bdi_dev_name(struct backing_dev_info *bdi) { if (!bdi || !bdi->dev) return bdi_unknown_name; return bdi->dev_name; } EXPORT_SYMBOL_GPL(bdi_dev_name); static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; static atomic_t nr_wb_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { wait_queue_head_t *wqh = &congestion_wqh[sync]; enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; if (test_and_clear_bit(bit, &bdi->wb.congested)) atomic_dec(&nr_wb_congested[sync]); smp_mb__after_atomic(); if (waitqueue_active(wqh)) wake_up(wqh); } EXPORT_SYMBOL(clear_bdi_congested); void set_bdi_congested(struct backing_dev_info *bdi, int sync) { enum wb_congested_state bit; bit = sync ? WB_sync_congested : WB_async_congested; if (!test_and_set_bit(bit, &bdi->wb.congested)) atomic_inc(&nr_wb_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. */ long congestion_wait(int sync, long timeout) { long ret; unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); trace_writeback_congestion_wait(jiffies_to_usecs(timeout), jiffies_to_usecs(jiffies - start)); return ret; } EXPORT_SYMBOL(congestion_wait); /** * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a pgdat to complete writes * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * In the event of a congested backing_dev (any backing_dev) this waits * for up to @timeout jiffies for either a BDI to exit congestion of the * given @sync queue or a write to complete. * * The return value is 0 if the sleep is for the full timeout. Otherwise, * it is the number of jiffies that were still remaining when the function * returned. return_value == timeout implies the function did not sleep. */ long wait_iff_congested(int sync, long timeout) { long ret; unsigned long start = jiffies; DEFINE_WAIT(wait); wait_queue_head_t *wqh = &congestion_wqh[sync]; /* * If there is no congestion, yield if necessary instead * of sleeping on the congestion queue */ if (atomic_read(&nr_wb_congested[sync]) == 0) { cond_resched(); /* In case we scheduled, work out time remaining */ ret = timeout - (jiffies - start); if (ret < 0) ret = 0; goto out; } /* Sleep until uncongested or a write happens */ prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); out: trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), jiffies_to_usecs(jiffies - start)); return ret; } EXPORT_SYMBOL(wait_iff_congested);
1 1 1 1 1 1 1 1 1 1 1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef __NET_NETLINK_H #define __NET_NETLINK_H #include <linux/types.h> #include <linux/netlink.h> #include <linux/jiffies.h> #include <linux/in6.h> /* ======================================================================== * Netlink Messages and Attributes Interface (As Seen On TV) * ------------------------------------------------------------------------ * Messages Interface * ------------------------------------------------------------------------ * * Message Format: * <--- nlmsg_total_size(payload) ---> * <-- nlmsg_msg_size(payload) -> * +----------+- - -+-------------+- - -+-------- - - * | nlmsghdr | Pad | Payload | Pad | nlmsghdr * +----------+- - -+-------------+- - -+-------- - - * nlmsg_data(nlh)---^ ^ * nlmsg_next(nlh)-----------------------+ * * Payload Format: * <---------------------- nlmsg_len(nlh) ---------------------> * <------ hdrlen ------> <- nlmsg_attrlen(nlh, hdrlen) -> * +----------------------+- - -+--------------------------------+ * | Family Header | Pad | Attributes | * +----------------------+- - -+--------------------------------+ * nlmsg_attrdata(nlh, hdrlen)---^ * * Data Structures: * struct nlmsghdr netlink message header * * Message Construction: * nlmsg_new() create a new netlink message * nlmsg_put() add a netlink message to an skb * nlmsg_put_answer() callback based nlmsg_put() * nlmsg_end() finalize netlink message * nlmsg_get_pos() return current position in message * nlmsg_trim() trim part of message * nlmsg_cancel() cancel message construction * nlmsg_free() free a netlink message * * Message Sending: * nlmsg_multicast() multicast message to several groups * nlmsg_unicast() unicast a message to a single socket * nlmsg_notify() send notification message * * Message Length Calculations: * nlmsg_msg_size(payload) length of message w/o padding * nlmsg_total_size(payload) length of message w/ padding * nlmsg_padlen(payload) length of padding at tail * * Message Payload Access: * nlmsg_data(nlh) head of message payload * nlmsg_len(nlh) length of message payload * nlmsg_attrdata(nlh, hdrlen) head of attributes data * nlmsg_attrlen(nlh, hdrlen) length of attributes data * * Message Parsing: * nlmsg_ok(nlh, remaining) does nlh fit into remaining bytes? * nlmsg_next(nlh, remaining) get next netlink message * nlmsg_parse() parse attributes of a message * nlmsg_find_attr() find an attribute in a message * nlmsg_for_each_msg() loop over all messages * nlmsg_validate() validate netlink message incl. attrs * nlmsg_for_each_attr() loop over all attributes * * Misc: * nlmsg_report() report back to application? * * ------------------------------------------------------------------------ * Attributes Interface * ------------------------------------------------------------------------ * * Attribute Format: * <------- nla_total_size(payload) -------> * <---- nla_attr_size(payload) -----> * +----------+- - -+- - - - - - - - - +- - -+-------- - - * | Header | Pad | Payload | Pad | Header * +----------+- - -+- - - - - - - - - +- - -+-------- - - * <- nla_len(nla) -> ^ * nla_data(nla)----^ | * nla_next(nla)-----------------------------' * * Data Structures: * struct nlattr netlink attribute header * * Attribute Construction: * nla_reserve(skb, type, len) reserve room for an attribute * nla_reserve_nohdr(skb, len) reserve room for an attribute w/o hdr * nla_put(skb, type, len, data) add attribute to skb * nla_put_nohdr(skb, len, data) add attribute w/o hdr * nla_append(skb, len, data) append data to skb * * Attribute Construction for Basic Types: * nla_put_u8(skb, type, value) add u8 attribute to skb * nla_put_u16(skb, type, value) add u16 attribute to skb * nla_put_u32(skb, type, value) add u32 attribute to skb * nla_put_u64_64bit(skb, type, * value, padattr) add u64 attribute to skb * nla_put_s8(skb, type, value) add s8 attribute to skb * nla_put_s16(skb, type, value) add s16 attribute to skb * nla_put_s32(skb, type, value) add s32 attribute to skb * nla_put_s64(skb, type, value, * padattr) add s64 attribute to skb * nla_put_string(skb, type, str) add string attribute to skb * nla_put_flag(skb, type) add flag attribute to skb * nla_put_msecs(skb, type, jiffies, * padattr) add msecs attribute to skb * nla_put_in_addr(skb, type, addr) add IPv4 address attribute to skb * nla_put_in6_addr(skb, type, addr) add IPv6 address attribute to skb * * Nested Attributes Construction: * nla_nest_start(skb, type) start a nested attribute * nla_nest_end(skb, nla) finalize a nested attribute * nla_nest_cancel(skb, nla) cancel nested attribute construction * * Attribute Length Calculations: * nla_attr_size(payload) length of attribute w/o padding * nla_total_size(payload) length of attribute w/ padding * nla_padlen(payload) length of padding * * Attribute Payload Access: * nla_data(nla) head of attribute payload * nla_len(nla) length of attribute payload * * Attribute Payload Access for Basic Types: * nla_get_u8(nla) get payload for a u8 attribute * nla_get_u16(nla) get payload for a u16 attribute * nla_get_u32(nla) get payload for a u32 attribute * nla_get_u64(nla) get payload for a u64 attribute * nla_get_s8(nla) get payload for a s8 attribute * nla_get_s16(nla) get payload for a s16 attribute * nla_get_s32(nla) get payload for a s32 attribute * nla_get_s64(nla) get payload for a s64 attribute * nla_get_flag(nla) return 1 if flag is true * nla_get_msecs(nla) get payload for a msecs attribute * * Attribute Misc: * nla_memcpy(dest, nla, count) copy attribute into memory * nla_memcmp(nla, data, size) compare attribute with memory area * nla_strlcpy(dst, nla, size) copy attribute to a sized string * nla_strcmp(nla, str) compare attribute with string * * Attribute Parsing: * nla_ok(nla, remaining) does nla fit into remaining bytes? * nla_next(nla, remaining) get next netlink attribute * nla_validate() validate a stream of attributes * nla_validate_nested() validate a stream of nested attributes * nla_find() find attribute in stream of attributes * nla_find_nested() find attribute in nested attributes * nla_parse() parse and validate stream of attrs * nla_parse_nested() parse nested attributes * nla_for_each_attr() loop over all attributes * nla_for_each_nested() loop over the nested attributes *========================================================================= */ /** * Standard attribute types to specify validation policy */ enum { NLA_UNSPEC, NLA_U8, NLA_U16, NLA_U32, NLA_U64, NLA_STRING, NLA_FLAG, NLA_MSECS, NLA_NESTED, NLA_NESTED_ARRAY, NLA_NUL_STRING, NLA_BINARY, NLA_S8, NLA_S16, NLA_S32, NLA_S64, NLA_BITFIELD32, NLA_REJECT, __NLA_TYPE_MAX, }; #define NLA_TYPE_MAX (__NLA_TYPE_MAX - 1) struct netlink_range_validation { u64 min, max; }; struct netlink_range_validation_signed { s64 min, max; }; enum nla_policy_validation { NLA_VALIDATE_NONE, NLA_VALIDATE_RANGE, NLA_VALIDATE_RANGE_WARN_TOO_LONG, NLA_VALIDATE_MIN, NLA_VALIDATE_MAX, NLA_VALIDATE_MASK, NLA_VALIDATE_RANGE_PTR, NLA_VALIDATE_FUNCTION, }; /** * struct nla_policy - attribute validation policy * @type: Type of attribute or NLA_UNSPEC * @validation_type: type of attribute validation done in addition to * type-specific validation (e.g. range, function call), see * &enum nla_policy_validation * @len: Type specific length of payload * * Policies are defined as arrays of this struct, the array must be * accessible by attribute type up to the highest identifier to be expected. * * Meaning of `len' field: * NLA_STRING Maximum length of string * NLA_NUL_STRING Maximum length of string (excluding NUL) * NLA_FLAG Unused * NLA_BINARY Maximum length of attribute payload * (but see also below with the validation type) * NLA_NESTED, * NLA_NESTED_ARRAY Length verification is done by checking len of * nested header (or empty); len field is used if * nested_policy is also used, for the max attr * number in the nested policy. * NLA_U8, NLA_U16, * NLA_U32, NLA_U64, * NLA_S8, NLA_S16, * NLA_S32, NLA_S64, * NLA_MSECS Leaving the length field zero will verify the * given type fits, using it verifies minimum length * just like "All other" * NLA_BITFIELD32 Unused * NLA_REJECT Unused * All other Minimum length of attribute payload * * Meaning of validation union: * NLA_BITFIELD32 This is a 32-bit bitmap/bitselector attribute and * `bitfield32_valid' is the u32 value of valid flags * NLA_REJECT This attribute is always rejected and `reject_message' * may point to a string to report as the error instead * of the generic one in extended ACK. * NLA_NESTED `nested_policy' to a nested policy to validate, must * also set `len' to the max attribute number. Use the * provided NLA_POLICY_NESTED() macro. * Note that nla_parse() will validate, but of course not * parse, the nested sub-policies. * NLA_NESTED_ARRAY `nested_policy' points to a nested policy to validate, * must also set `len' to the max attribute number. Use * the provided NLA_POLICY_NESTED_ARRAY() macro. * The difference to NLA_NESTED is the structure: * NLA_NESTED has the nested attributes directly inside * while an array has the nested attributes at another * level down and the attribute types directly in the * nesting don't matter. * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64, * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 The `min' and `max' fields are used depending on the * validation_type field, if that is min/max/range then * the min, max or both are used (respectively) to check * the value of the integer attribute. * Note that in the interest of code simplicity and * struct size both limits are s16, so you cannot * enforce a range that doesn't fall within the range * of s16 - do that as usual in the code instead. * Use the NLA_POLICY_MIN(), NLA_POLICY_MAX() and * NLA_POLICY_RANGE() macros. * NLA_U8, * NLA_U16, * NLA_U32, * NLA_U64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range' must be a pointer * to a struct netlink_range_validation that indicates * the min/max values. * Use NLA_POLICY_FULL_RANGE(). * NLA_S8, * NLA_S16, * NLA_S32, * NLA_S64 If the validation_type field instead is set to * NLA_VALIDATE_RANGE_PTR, `range_signed' must be a * pointer to a struct netlink_range_validation_signed * that indicates the min/max values. * Use NLA_POLICY_FULL_RANGE_SIGNED(). * * NLA_BINARY If the validation type is like the ones for integers * above, then the min/max length (not value like for * integers) of the attribute is enforced. * * All other Unused - but note that it's a union * * Meaning of `validate' field, use via NLA_POLICY_VALIDATE_FN: * NLA_BINARY Validation function called for the attribute. * All other Unused - but note that it's a union * * Example: * * static const u32 myvalidflags = 0xff231023; * * static const struct nla_policy my_policy[ATTR_MAX+1] = { * [ATTR_FOO] = { .type = NLA_U16 }, * [ATTR_BAR] = { .type = NLA_STRING, .len = BARSIZ }, * [ATTR_BAZ] = NLA_POLICY_EXACT_LEN(sizeof(struct mystruct)), * [ATTR_GOO] = NLA_POLICY_BITFIELD32(myvalidflags), * }; */ struct nla_policy { u8 type; u8 validation_type; u16 len; union { const u32 bitfield32_valid; const u32 mask; const char *reject_message; const struct nla_policy *nested_policy; struct netlink_range_validation *range; struct netlink_range_validation_signed *range_signed; struct { s16 min, max; }; int (*validate)(const struct nlattr *attr, struct netlink_ext_ack *extack); /* This entry is special, and used for the attribute at index 0 * only, and specifies special data about the policy, namely it * specifies the "boundary type" where strict length validation * starts for any attribute types >= this value, also, strict * nesting validation starts here. * * Additionally, it means that NLA_UNSPEC is actually NLA_REJECT * for any types >= this, so need to use NLA_POLICY_MIN_LEN() to * get the previous pure { .len = xyz } behaviour. The advantage * of this is that types not specified in the policy will be * rejected. * * For completely new families it should be set to 1 so that the * validation is enforced for all attributes. For existing ones * it should be set at least when new attributes are added to * the enum used by the policy, and be set to the new value that * was added to enforce strict validation from thereon. */ u16 strict_start_type; }; }; #define NLA_POLICY_ETH_ADDR NLA_POLICY_EXACT_LEN(ETH_ALEN) #define NLA_POLICY_ETH_ADDR_COMPAT NLA_POLICY_EXACT_LEN_WARN(ETH_ALEN) #define _NLA_POLICY_NESTED(maxattr, policy) \ { .type = NLA_NESTED, .nested_policy = policy, .len = maxattr } #define _NLA_POLICY_NESTED_ARRAY(maxattr, policy) \ { .type = NLA_NESTED_ARRAY, .nested_policy = policy, .len = maxattr } #define NLA_POLICY_NESTED(policy) \ _NLA_POLICY_NESTED(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_NESTED_ARRAY(policy) \ _NLA_POLICY_NESTED_ARRAY(ARRAY_SIZE(policy) - 1, policy) #define NLA_POLICY_BITFIELD32(valid) \ { .type = NLA_BITFIELD32, .bitfield32_valid = valid } #define __NLA_IS_UINT_TYPE(tp) \ (tp == NLA_U8 || tp == NLA_U16 || tp == NLA_U32 || tp == NLA_U64) #define __NLA_IS_SINT_TYPE(tp) \ (tp == NLA_S8 || tp == NLA_S16 || tp == NLA_S32 || tp == NLA_S64) #define __NLA_ENSURE(condition) BUILD_BUG_ON_ZERO(!(condition)) #define NLA_ENSURE_UINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp)) + tp) #define NLA_ENSURE_UINT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_SINT_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_SINT_TYPE(tp)) + tp) #define NLA_ENSURE_INT_OR_BINARY_TYPE(tp) \ (__NLA_ENSURE(__NLA_IS_UINT_TYPE(tp) || \ __NLA_IS_SINT_TYPE(tp) || \ tp == NLA_MSECS || \ tp == NLA_BINARY) + tp) #define NLA_ENSURE_NO_VALIDATION_PTR(tp) \ (__NLA_ENSURE(tp != NLA_BITFIELD32 && \ tp != NLA_REJECT && \ tp != NLA_NESTED && \ tp != NLA_NESTED_ARRAY) + tp) #define NLA_POLICY_RANGE(tp, _min, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE, \ .min = _min, \ .max = _max \ } #define NLA_POLICY_FULL_RANGE(tp, _range) { \ .type = NLA_ENSURE_UINT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range = _range, \ } #define NLA_POLICY_FULL_RANGE_SIGNED(tp, _range) { \ .type = NLA_ENSURE_SINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_RANGE_PTR, \ .range_signed = _range, \ } #define NLA_POLICY_MIN(tp, _min) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MIN, \ .min = _min, \ } #define NLA_POLICY_MAX(tp, _max) { \ .type = NLA_ENSURE_INT_OR_BINARY_TYPE(tp), \ .validation_type = NLA_VALIDATE_MAX, \ .max = _max, \ } #define NLA_POLICY_MASK(tp, _mask) { \ .type = NLA_ENSURE_UINT_TYPE(tp), \ .validation_type = NLA_VALIDATE_MASK, \ .mask = _mask, \ } #define NLA_POLICY_VALIDATE_FN(tp, fn, ...) { \ .type = NLA_ENSURE_NO_VALIDATION_PTR(tp), \ .validation_type = NLA_VALIDATE_FUNCTION, \ .validate = fn, \ .len = __VA_ARGS__ + 0, \ } #define NLA_POLICY_EXACT_LEN(_len) NLA_POLICY_RANGE(NLA_BINARY, _len, _len) #define NLA_POLICY_EXACT_LEN_WARN(_len) { \ .type = NLA_BINARY, \ .validation_type = NLA_VALIDATE_RANGE_WARN_TOO_LONG, \ .min = _len, \ .max = _len \ } #define NLA_POLICY_MIN_LEN(_len) NLA_POLICY_MIN(NLA_BINARY, _len) /** * struct nl_info - netlink source information * @nlh: Netlink message header of original request * @nl_net: Network namespace * @portid: Netlink PORTID of requesting application * @skip_notify: Skip netlink notifications to user space * @skip_notify_kernel: Skip selected in-kernel notifications */ struct nl_info { struct nlmsghdr *nlh; struct net *nl_net; u32 portid; u8 skip_notify:1, skip_notify_kernel:1; }; /** * enum netlink_validation - netlink message/attribute validation levels * @NL_VALIDATE_LIBERAL: Old-style "be liberal" validation, not caring about * extra data at the end of the message, attributes being longer than * they should be, or unknown attributes being present. * @NL_VALIDATE_TRAILING: Reject junk data encountered after attribute parsing. * @NL_VALIDATE_MAXTYPE: Reject attributes > max type; Together with _TRAILING * this is equivalent to the old nla_parse_strict()/nlmsg_parse_strict(). * @NL_VALIDATE_UNSPEC: Reject attributes with NLA_UNSPEC in the policy. * This can safely be set by the kernel when the given policy has no * NLA_UNSPEC anymore, and can thus be used to ensure policy entries * are enforced going forward. * @NL_VALIDATE_STRICT_ATTRS: strict attribute policy parsing (e.g. * U8, U16, U32 must have exact size, etc.) * @NL_VALIDATE_NESTED: Check that NLA_F_NESTED is set for NLA_NESTED(_ARRAY) * and unset for other policies. */ enum netlink_validation { NL_VALIDATE_LIBERAL = 0, NL_VALIDATE_TRAILING = BIT(0), NL_VALIDATE_MAXTYPE = BIT(1), NL_VALIDATE_UNSPEC = BIT(2), NL_VALIDATE_STRICT_ATTRS = BIT(3), NL_VALIDATE_NESTED = BIT(4), }; #define NL_VALIDATE_DEPRECATED_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE) #define NL_VALIDATE_STRICT (NL_VALIDATE_TRAILING |\ NL_VALIDATE_MAXTYPE |\ NL_VALIDATE_UNSPEC |\ NL_VALIDATE_STRICT_ATTRS |\ NL_VALIDATE_NESTED) int netlink_rcv_skb(struct sk_buff *skb, int (*cb)(struct sk_buff *, struct nlmsghdr *, struct netlink_ext_ack *)); int nlmsg_notify(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, int report, gfp_t flags); int __nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int __nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack); int nla_policy_len(const struct nla_policy *, int); struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype); size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize); char *nla_strdup(const struct nlattr *nla, gfp_t flags); int nla_memcpy(void *dest, const struct nlattr *src, int count); int nla_memcmp(const struct nlattr *nla, const void *data, size_t size); int nla_strcmp(const struct nlattr *nla, const char *str); struct nlattr *__nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *__nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *__nla_reserve_nohdr(struct sk_buff *skb, int attrlen); struct nlattr *nla_reserve(struct sk_buff *skb, int attrtype, int attrlen); struct nlattr *nla_reserve_64bit(struct sk_buff *skb, int attrtype, int attrlen, int padattr); void *nla_reserve_nohdr(struct sk_buff *skb, int attrlen); void __nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); void __nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); void __nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_put(struct sk_buff *skb, int attrtype, int attrlen, const void *data); int nla_put_64bit(struct sk_buff *skb, int attrtype, int attrlen, const void *data, int padattr); int nla_put_nohdr(struct sk_buff *skb, int attrlen, const void *data); int nla_append(struct sk_buff *skb, int attrlen, const void *data); /************************************************************************** * Netlink Messages **************************************************************************/ /** * nlmsg_msg_size - length of netlink message not including padding * @payload: length of message payload */ static inline int nlmsg_msg_size(int payload) { return NLMSG_HDRLEN + payload; } /** * nlmsg_total_size - length of netlink message including padding * @payload: length of message payload */ static inline int nlmsg_total_size(int payload) { return NLMSG_ALIGN(nlmsg_msg_size(payload)); } /** * nlmsg_padlen - length of padding at the message's tail * @payload: length of message payload */ static inline int nlmsg_padlen(int payload) { return nlmsg_total_size(payload) - nlmsg_msg_size(payload); } /** * nlmsg_data - head of message payload * @nlh: netlink message header */ static inline void *nlmsg_data(const struct nlmsghdr *nlh) { return (unsigned char *) nlh + NLMSG_HDRLEN; } /** * nlmsg_len - length of message payload * @nlh: netlink message header */ static inline int nlmsg_len(const struct nlmsghdr *nlh) { return nlh->nlmsg_len - NLMSG_HDRLEN; } /** * nlmsg_attrdata - head of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline struct nlattr *nlmsg_attrdata(const struct nlmsghdr *nlh, int hdrlen) { unsigned char *data = nlmsg_data(nlh); return (struct nlattr *) (data + NLMSG_ALIGN(hdrlen)); } /** * nlmsg_attrlen - length of attributes data * @nlh: netlink message header * @hdrlen: length of family specific header */ static inline int nlmsg_attrlen(const struct nlmsghdr *nlh, int hdrlen) { return nlmsg_len(nlh) - NLMSG_ALIGN(hdrlen); } /** * nlmsg_ok - check if the netlink message fits into the remaining bytes * @nlh: netlink message header * @remaining: number of bytes remaining in message stream */ static inline int nlmsg_ok(const struct nlmsghdr *nlh, int remaining) { return (remaining >= (int) sizeof(struct nlmsghdr) && nlh->nlmsg_len >= sizeof(struct nlmsghdr) && nlh->nlmsg_len <= remaining); } /** * nlmsg_next - next netlink message in message stream * @nlh: netlink message header * @remaining: number of bytes remaining in message stream * * Returns the next netlink message in the message stream and * decrements remaining by the size of the current message. */ static inline struct nlmsghdr * nlmsg_next(const struct nlmsghdr *nlh, int *remaining) { int totlen = NLMSG_ALIGN(nlh->nlmsg_len); *remaining -= totlen; return (struct nlmsghdr *) ((unsigned char *) nlh + totlen); } /** * nla_parse - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected, policy must be specified, attributes * will be validated in the strictest way possible. * * Returns 0 on success or a negative error code. */ static inline int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_deprecated - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be ignored and attributes from the policy are not * always strictly validated (only for new attributes). * * Returns 0 on success or a negative error code. */ static inline int nla_parse_deprecated(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_parse_deprecated_strict - Parse a stream of attributes into a tb buffer * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @head: head of attribute stream * @len: length of attribute stream * @policy: validation policy * @extack: extended ACK pointer * * Parses a stream of attributes and stores a pointer to each attribute in * the tb array accessible via the attribute type. Attributes with a type * exceeding maxtype will be rejected as well as trailing data, but the * policy is not completely strictly validated (only for new attributes). * * Returns 0 on success or a negative error code. */ static inline int nla_parse_deprecated_strict(struct nlattr **tb, int maxtype, const struct nlattr *head, int len, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, head, len, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * __nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * See nla_parse() */ static inline int __nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) { NL_SET_ERR_MSG(extack, "Invalid header length"); return -EINVAL; } return __nla_parse(tb, maxtype, nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), policy, validate, extack); } /** * nlmsg_parse - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @extack: extended ACK report struct * * See nla_parse() */ static inline int nlmsg_parse(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_parse_deprecated - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nlmsg_parse_deprecated(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_parse_deprecated_strict - parse attributes of a netlink message * @nlh: netlink message header * @hdrlen: length of family specific header * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @extack: extended ACK report struct * * See nla_parse_deprecated_strict() */ static inline int nlmsg_parse_deprecated_strict(const struct nlmsghdr *nlh, int hdrlen, struct nlattr *tb[], int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nlmsg_parse(nlh, hdrlen, tb, maxtype, policy, NL_VALIDATE_DEPRECATED_STRICT, extack); } /** * nlmsg_find_attr - find a specific attribute in a netlink message * @nlh: netlink message header * @hdrlen: length of familiy specific header * @attrtype: type of attribute to look for * * Returns the first attribute which matches the specified type. */ static inline struct nlattr *nlmsg_find_attr(const struct nlmsghdr *nlh, int hdrlen, int attrtype) { return nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), attrtype); } /** * nla_validate_deprecated - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in liberal mode. * See documenation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ static inline int nla_validate_deprecated(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_validate - Validate a stream of attributes * @head: head of attribute stream * @len: length of attribute stream * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct * * Validates all attributes in the specified attribute stream against the * specified policy. Validation is done in strict mode. * See documenation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ static inline int nla_validate(const struct nlattr *head, int len, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate(head, len, maxtype, policy, NL_VALIDATE_STRICT, extack); } /** * nlmsg_validate_deprecated - validate a netlink message including attributes * @nlh: netlinket message header * @hdrlen: length of familiy specific header * @maxtype: maximum attribute type to be expected * @policy: validation policy * @extack: extended ACK report struct */ static inline int nlmsg_validate_deprecated(const struct nlmsghdr *nlh, int hdrlen, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; return __nla_validate(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nlmsg_report - need to report back to application? * @nlh: netlink message header * * Returns 1 if a report back to the application is requested. */ static inline int nlmsg_report(const struct nlmsghdr *nlh) { return !!(nlh->nlmsg_flags & NLM_F_ECHO); } /** * nlmsg_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @nlh: netlink message header * @hdrlen: length of familiy specific header * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_attr(pos, nlh, hdrlen, rem) \ nla_for_each_attr(pos, nlmsg_attrdata(nlh, hdrlen), \ nlmsg_attrlen(nlh, hdrlen), rem) /** * nlmsg_put - Add a new netlink message to an skb * @skb: socket buffer to store message in * @portid: netlink PORTID of requesting application * @seq: sequence number of message * @type: message type * @payload: length of message payload * @flags: message flags * * Returns NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int payload, int flags) { if (unlikely(skb_tailroom(skb) < nlmsg_total_size(payload))) return NULL; return __nlmsg_put(skb, portid, seq, type, payload, flags); } /** * nlmsg_put_answer - Add a new callback based netlink message to an skb * @skb: socket buffer to store message in * @cb: netlink callback * @type: message type * @payload: length of message payload * @flags: message flags * * Returns NULL if the tailroom of the skb is insufficient to store * the message header and payload. */ static inline struct nlmsghdr *nlmsg_put_answer(struct sk_buff *skb, struct netlink_callback *cb, int type, int payload, int flags) { return nlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, type, payload, flags); } /** * nlmsg_new - Allocate a new netlink message * @payload: size of the message payload * @flags: the type of memory to allocate. * * Use NLMSG_DEFAULT_SIZE if the size of the payload isn't known * and a good default is needed. */ static inline struct sk_buff *nlmsg_new(size_t payload, gfp_t flags) { return alloc_skb(nlmsg_total_size(payload), flags); } /** * nlmsg_end - Finalize a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Corrects the netlink message header to include the appeneded * attributes. Only necessary if attributes have been added to * the message. */ static inline void nlmsg_end(struct sk_buff *skb, struct nlmsghdr *nlh) { nlh->nlmsg_len = skb_tail_pointer(skb) - (unsigned char *)nlh; } /** * nlmsg_get_pos - return current position in netlink message * @skb: socket buffer the message is stored in * * Returns a pointer to the current tail of the message. */ static inline void *nlmsg_get_pos(struct sk_buff *skb) { return skb_tail_pointer(skb); } /** * nlmsg_trim - Trim message to a mark * @skb: socket buffer the message is stored in * @mark: mark to trim to * * Trims the message to the provided mark. */ static inline void nlmsg_trim(struct sk_buff *skb, const void *mark) { if (mark) { WARN_ON((unsigned char *) mark < skb->data); skb_trim(skb, (unsigned char *) mark - skb->data); } } /** * nlmsg_cancel - Cancel construction of a netlink message * @skb: socket buffer the message is stored in * @nlh: netlink message header * * Removes the complete netlink message including all * attributes from the socket buffer again. */ static inline void nlmsg_cancel(struct sk_buff *skb, struct nlmsghdr *nlh) { nlmsg_trim(skb, nlh); } /** * nlmsg_free - free a netlink message * @skb: socket buffer of netlink message */ static inline void nlmsg_free(struct sk_buff *skb) { kfree_skb(skb); } /** * nlmsg_multicast - multicast a netlink message * @sk: netlink socket to spread messages to * @skb: netlink message as socket buffer * @portid: own netlink portid to avoid sending to yourself * @group: multicast group id * @flags: allocation flags */ static inline int nlmsg_multicast(struct sock *sk, struct sk_buff *skb, u32 portid, unsigned int group, gfp_t flags) { int err; NETLINK_CB(skb).dst_group = group; err = netlink_broadcast(sk, skb, portid, group, flags); if (err > 0) err = 0; return err; } /** * nlmsg_unicast - unicast a netlink message * @sk: netlink socket to spread message to * @skb: netlink message as socket buffer * @portid: netlink portid of the destination socket */ static inline int nlmsg_unicast(struct sock *sk, struct sk_buff *skb, u32 portid) { int err; err = netlink_unicast(sk, skb, portid, MSG_DONTWAIT); if (err > 0) err = 0; return err; } /** * nlmsg_for_each_msg - iterate over a stream of messages * @pos: loop counter, set to current message * @head: head of message stream * @len: length of message stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nlmsg_for_each_msg(pos, head, len, rem) \ for (pos = head, rem = len; \ nlmsg_ok(pos, rem); \ pos = nlmsg_next(pos, &(rem))) /** * nl_dump_check_consistent - check if sequence is consistent and advertise if not * @cb: netlink callback structure that stores the sequence number * @nlh: netlink message header to write the flag to * * This function checks if the sequence (generation) number changed during dump * and if it did, advertises it in the netlink message header. * * The correct way to use it is to set cb->seq to the generation counter when * all locks for dumping have been acquired, and then call this function for * each message that is generated. * * Note that due to initialisation concerns, 0 is an invalid sequence number * and must not be used by code that uses this functionality. */ static inline void nl_dump_check_consistent(struct netlink_callback *cb, struct nlmsghdr *nlh) { if (cb->prev_seq && cb->seq != cb->prev_seq) nlh->nlmsg_flags |= NLM_F_DUMP_INTR; cb->prev_seq = cb->seq; } /************************************************************************** * Netlink Attributes **************************************************************************/ /** * nla_attr_size - length of attribute not including padding * @payload: length of payload */ static inline int nla_attr_size(int payload) { return NLA_HDRLEN + payload; } /** * nla_total_size - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size(int payload) { return NLA_ALIGN(nla_attr_size(payload)); } /** * nla_padlen - length of padding at the tail of attribute * @payload: length of payload */ static inline int nla_padlen(int payload) { return nla_total_size(payload) - nla_attr_size(payload); } /** * nla_type - attribute type * @nla: netlink attribute */ static inline int nla_type(const struct nlattr *nla) { return nla->nla_type & NLA_TYPE_MASK; } /** * nla_data - head of payload * @nla: netlink attribute */ static inline void *nla_data(const struct nlattr *nla) { return (char *) nla + NLA_HDRLEN; } /** * nla_len - length of payload * @nla: netlink attribute */ static inline int nla_len(const struct nlattr *nla) { return nla->nla_len - NLA_HDRLEN; } /** * nla_ok - check if the netlink attribute fits into the remaining bytes * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream */ static inline int nla_ok(const struct nlattr *nla, int remaining) { return remaining >= (int) sizeof(*nla) && nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } /** * nla_next - next netlink attribute in attribute stream * @nla: netlink attribute * @remaining: number of bytes remaining in attribute stream * * Returns the next netlink attribute in the attribute stream and * decrements remaining by the size of the current attribute. */ static inline struct nlattr *nla_next(const struct nlattr *nla, int *remaining) { unsigned int totlen = NLA_ALIGN(nla->nla_len); *remaining -= totlen; return (struct nlattr *) ((char *) nla + totlen); } /** * nla_find_nested - find attribute in a set of nested attributes * @nla: attribute containing the nested attributes * @attrtype: type of attribute to look for * * Returns the first attribute which matches the specified type. */ static inline struct nlattr * nla_find_nested(const struct nlattr *nla, int attrtype) { return nla_find(nla_data(nla), nla_len(nla), attrtype); } /** * nla_parse_nested - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse() */ static inline int nla_parse_nested(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { if (!(nla->nla_type & NLA_F_NESTED)) { NL_SET_ERR_MSG_ATTR(extack, nla, "NLA_F_NESTED is missing"); return -EINVAL; } return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_STRICT, extack); } /** * nla_parse_nested_deprecated - parse nested attributes * @tb: destination array with maxtype+1 elements * @maxtype: maximum attribute type to be expected * @nla: attribute containing the nested attributes * @policy: validation policy * @extack: extended ACK report struct * * See nla_parse_deprecated() */ static inline int nla_parse_nested_deprecated(struct nlattr *tb[], int maxtype, const struct nlattr *nla, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_parse(tb, maxtype, nla_data(nla), nla_len(nla), policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_put_u8 - Add a u8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u8(struct sk_buff *skb, int attrtype, u8 value) { /* temporary variables to work around GCC PR81715 with asan-stack=1 */ u8 tmp = value; return nla_put(skb, attrtype, sizeof(u8), &tmp); } /** * nla_put_u16 - Add a u16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u16(struct sk_buff *skb, int attrtype, u16 value) { u16 tmp = value; return nla_put(skb, attrtype, sizeof(u16), &tmp); } /** * nla_put_be16 - Add a __be16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put(skb, attrtype, sizeof(__be16), &tmp); } /** * nla_put_net16 - Add 16-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net16(struct sk_buff *skb, int attrtype, __be16 value) { __be16 tmp = value; return nla_put_be16(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le16 - Add a __le16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le16(struct sk_buff *skb, int attrtype, __le16 value) { __le16 tmp = value; return nla_put(skb, attrtype, sizeof(__le16), &tmp); } /** * nla_put_u32 - Add a u32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_u32(struct sk_buff *skb, int attrtype, u32 value) { u32 tmp = value; return nla_put(skb, attrtype, sizeof(u32), &tmp); } /** * nla_put_be32 - Add a __be32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_be32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put(skb, attrtype, sizeof(__be32), &tmp); } /** * nla_put_net32 - Add 32-bit network byte order netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_net32(struct sk_buff *skb, int attrtype, __be32 value) { __be32 tmp = value; return nla_put_be32(skb, attrtype | NLA_F_NET_BYTEORDER, tmp); } /** * nla_put_le32 - Add a __le32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_le32(struct sk_buff *skb, int attrtype, __le32 value) { __le32 tmp = value; return nla_put(skb, attrtype, sizeof(__le32), &tmp); } /** * nla_put_u64_64bit - Add a u64 netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_u64_64bit(struct sk_buff *skb, int attrtype, u64 value, int padattr) { u64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_be64 - Add a __be64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_be64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__be64), &tmp, padattr); } /** * nla_put_net64 - Add 64-bit network byte order nlattr to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_net64(struct sk_buff *skb, int attrtype, __be64 value, int padattr) { __be64 tmp = value; return nla_put_be64(skb, attrtype | NLA_F_NET_BYTEORDER, tmp, padattr); } /** * nla_put_le64 - Add a __le64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_le64(struct sk_buff *skb, int attrtype, __le64 value, int padattr) { __le64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(__le64), &tmp, padattr); } /** * nla_put_s8 - Add a s8 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s8(struct sk_buff *skb, int attrtype, s8 value) { s8 tmp = value; return nla_put(skb, attrtype, sizeof(s8), &tmp); } /** * nla_put_s16 - Add a s16 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s16(struct sk_buff *skb, int attrtype, s16 value) { s16 tmp = value; return nla_put(skb, attrtype, sizeof(s16), &tmp); } /** * nla_put_s32 - Add a s32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value */ static inline int nla_put_s32(struct sk_buff *skb, int attrtype, s32 value) { s32 tmp = value; return nla_put(skb, attrtype, sizeof(s32), &tmp); } /** * nla_put_s64 - Add a s64 netlink attribute to a socket buffer and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: numeric value * @padattr: attribute type for the padding */ static inline int nla_put_s64(struct sk_buff *skb, int attrtype, s64 value, int padattr) { s64 tmp = value; return nla_put_64bit(skb, attrtype, sizeof(s64), &tmp, padattr); } /** * nla_put_string - Add a string netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @str: NUL terminated string */ static inline int nla_put_string(struct sk_buff *skb, int attrtype, const char *str) { return nla_put(skb, attrtype, strlen(str) + 1, str); } /** * nla_put_flag - Add a flag netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type */ static inline int nla_put_flag(struct sk_buff *skb, int attrtype) { return nla_put(skb, attrtype, 0, NULL); } /** * nla_put_msecs - Add a msecs netlink attribute to a skb and align it * @skb: socket buffer to add attribute to * @attrtype: attribute type * @njiffies: number of jiffies to convert to msecs * @padattr: attribute type for the padding */ static inline int nla_put_msecs(struct sk_buff *skb, int attrtype, unsigned long njiffies, int padattr) { u64 tmp = jiffies_to_msecs(njiffies); return nla_put_64bit(skb, attrtype, sizeof(u64), &tmp, padattr); } /** * nla_put_in_addr - Add an IPv4 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv4 address */ static inline int nla_put_in_addr(struct sk_buff *skb, int attrtype, __be32 addr) { __be32 tmp = addr; return nla_put_be32(skb, attrtype, tmp); } /** * nla_put_in6_addr - Add an IPv6 address netlink attribute to a socket * buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @addr: IPv6 address */ static inline int nla_put_in6_addr(struct sk_buff *skb, int attrtype, const struct in6_addr *addr) { return nla_put(skb, attrtype, sizeof(*addr), addr); } /** * nla_put_bitfield32 - Add a bitfield32 netlink attribute to a socket buffer * @skb: socket buffer to add attribute to * @attrtype: attribute type * @value: value carrying bits * @selector: selector of valid bits */ static inline int nla_put_bitfield32(struct sk_buff *skb, int attrtype, __u32 value, __u32 selector) { struct nla_bitfield32 tmp = { value, selector, }; return nla_put(skb, attrtype, sizeof(tmp), &tmp); } /** * nla_get_u32 - return payload of u32 attribute * @nla: u32 netlink attribute */ static inline u32 nla_get_u32(const struct nlattr *nla) { return *(u32 *) nla_data(nla); } /** * nla_get_be32 - return payload of __be32 attribute * @nla: __be32 netlink attribute */ static inline __be32 nla_get_be32(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_le32 - return payload of __le32 attribute * @nla: __le32 netlink attribute */ static inline __le32 nla_get_le32(const struct nlattr *nla) { return *(__le32 *) nla_data(nla); } /** * nla_get_u16 - return payload of u16 attribute * @nla: u16 netlink attribute */ static inline u16 nla_get_u16(const struct nlattr *nla) { return *(u16 *) nla_data(nla); } /** * nla_get_be16 - return payload of __be16 attribute * @nla: __be16 netlink attribute */ static inline __be16 nla_get_be16(const struct nlattr *nla) { return *(__be16 *) nla_data(nla); } /** * nla_get_le16 - return payload of __le16 attribute * @nla: __le16 netlink attribute */ static inline __le16 nla_get_le16(const struct nlattr *nla) { return *(__le16 *) nla_data(nla); } /** * nla_get_u8 - return payload of u8 attribute * @nla: u8 netlink attribute */ static inline u8 nla_get_u8(const struct nlattr *nla) { return *(u8 *) nla_data(nla); } /** * nla_get_u64 - return payload of u64 attribute * @nla: u64 netlink attribute */ static inline u64 nla_get_u64(const struct nlattr *nla) { u64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_be64 - return payload of __be64 attribute * @nla: __be64 netlink attribute */ static inline __be64 nla_get_be64(const struct nlattr *nla) { __be64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_le64 - return payload of __le64 attribute * @nla: __le64 netlink attribute */ static inline __le64 nla_get_le64(const struct nlattr *nla) { return *(__le64 *) nla_data(nla); } /** * nla_get_s32 - return payload of s32 attribute * @nla: s32 netlink attribute */ static inline s32 nla_get_s32(const struct nlattr *nla) { return *(s32 *) nla_data(nla); } /** * nla_get_s16 - return payload of s16 attribute * @nla: s16 netlink attribute */ static inline s16 nla_get_s16(const struct nlattr *nla) { return *(s16 *) nla_data(nla); } /** * nla_get_s8 - return payload of s8 attribute * @nla: s8 netlink attribute */ static inline s8 nla_get_s8(const struct nlattr *nla) { return *(s8 *) nla_data(nla); } /** * nla_get_s64 - return payload of s64 attribute * @nla: s64 netlink attribute */ static inline s64 nla_get_s64(const struct nlattr *nla) { s64 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_flag - return payload of flag attribute * @nla: flag netlink attribute */ static inline int nla_get_flag(const struct nlattr *nla) { return !!nla; } /** * nla_get_msecs - return payload of msecs attribute * @nla: msecs netlink attribute * * Returns the number of milliseconds in jiffies. */ static inline unsigned long nla_get_msecs(const struct nlattr *nla) { u64 msecs = nla_get_u64(nla); return msecs_to_jiffies((unsigned long) msecs); } /** * nla_get_in_addr - return payload of IPv4 address attribute * @nla: IPv4 address netlink attribute */ static inline __be32 nla_get_in_addr(const struct nlattr *nla) { return *(__be32 *) nla_data(nla); } /** * nla_get_in6_addr - return payload of IPv6 address attribute * @nla: IPv6 address netlink attribute */ static inline struct in6_addr nla_get_in6_addr(const struct nlattr *nla) { struct in6_addr tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_get_bitfield32 - return payload of 32 bitfield attribute * @nla: nla_bitfield32 attribute */ static inline struct nla_bitfield32 nla_get_bitfield32(const struct nlattr *nla) { struct nla_bitfield32 tmp; nla_memcpy(&tmp, nla, sizeof(tmp)); return tmp; } /** * nla_memdup - duplicate attribute memory (kmemdup) * @src: netlink attribute to duplicate from * @gfp: GFP mask */ static inline void *nla_memdup(const struct nlattr *src, gfp_t gfp) { return kmemdup(nla_data(src), nla_len(src), gfp); } /** * nla_nest_start_noflag - Start a new level of nested attributes * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * This function exists for backward compatibility to use in APIs which never * marked their nest attributes with NLA_F_NESTED flag. New APIs should use * nla_nest_start() which sets the flag. * * Returns the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start_noflag(struct sk_buff *skb, int attrtype) { struct nlattr *start = (struct nlattr *)skb_tail_pointer(skb); if (nla_put(skb, attrtype, 0, NULL) < 0) return NULL; return start; } /** * nla_nest_start - Start a new level of nested attributes, with NLA_F_NESTED * @skb: socket buffer to add attributes to * @attrtype: attribute type of container * * Unlike nla_nest_start_noflag(), mark the nest attribute with NLA_F_NESTED * flag. This is the preferred function to use in new code. * * Returns the container attribute or NULL on error */ static inline struct nlattr *nla_nest_start(struct sk_buff *skb, int attrtype) { return nla_nest_start_noflag(skb, attrtype | NLA_F_NESTED); } /** * nla_nest_end - Finalize nesting of attributes * @skb: socket buffer the attributes are stored in * @start: container attribute * * Corrects the container attribute header to include the all * appeneded attributes. * * Returns the total data length of the skb. */ static inline int nla_nest_end(struct sk_buff *skb, struct nlattr *start) { start->nla_len = skb_tail_pointer(skb) - (unsigned char *)start; return skb->len; } /** * nla_nest_cancel - Cancel nesting of attributes * @skb: socket buffer the message is stored in * @start: container attribute * * Removes the container attribute and including all nested * attributes. Returns -EMSGSIZE */ static inline void nla_nest_cancel(struct sk_buff *skb, struct nlattr *start) { nlmsg_trim(skb, start); } /** * __nla_validate_nested - Validate a stream of nested attributes * @start: container attribute * @maxtype: maximum attribute type to be expected * @policy: validation policy * @validate: validation strictness * @extack: extended ACK report struct * * Validates all attributes in the nested attribute stream against the * specified policy. Attributes with a type exceeding maxtype will be * ignored. See documenation of struct nla_policy for more details. * * Returns 0 on success or a negative error code. */ static inline int __nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, unsigned int validate, struct netlink_ext_ack *extack) { return __nla_validate(nla_data(start), nla_len(start), maxtype, policy, validate, extack); } static inline int nla_validate_nested(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_STRICT, extack); } static inline int nla_validate_nested_deprecated(const struct nlattr *start, int maxtype, const struct nla_policy *policy, struct netlink_ext_ack *extack) { return __nla_validate_nested(start, maxtype, policy, NL_VALIDATE_LIBERAL, extack); } /** * nla_need_padding_for_64bit - test 64-bit alignment of the next attribute * @skb: socket buffer the message is stored in * * Return true if padding is needed to align the next attribute (nla_data()) to * a 64-bit aligned area. */ static inline bool nla_need_padding_for_64bit(struct sk_buff *skb) { #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS /* The nlattr header is 4 bytes in size, that's why we test * if the skb->data _is_ aligned. A NOP attribute, plus * nlattr header for next attribute, will make nla_data() * 8-byte aligned. */ if (IS_ALIGNED((unsigned long)skb_tail_pointer(skb), 8)) return true; #endif return false; } /** * nla_align_64bit - 64-bit align the nla_data() of next attribute * @skb: socket buffer the message is stored in * @padattr: attribute type for the padding * * Conditionally emit a padding netlink attribute in order to make * the next attribute we emit have a 64-bit aligned nla_data() area. * This will only be done in architectures which do not have * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS defined. * * Returns zero on success or a negative error code. */ static inline int nla_align_64bit(struct sk_buff *skb, int padattr) { if (nla_need_padding_for_64bit(skb) && !nla_reserve(skb, padattr, 0)) return -EMSGSIZE; return 0; } /** * nla_total_size_64bit - total length of attribute including padding * @payload: length of payload */ static inline int nla_total_size_64bit(int payload) { return NLA_ALIGN(nla_attr_size(payload)) #ifndef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS + NLA_ALIGN(nla_attr_size(0)) #endif ; } /** * nla_for_each_attr - iterate over a stream of attributes * @pos: loop counter, set to current attribute * @head: head of attribute stream * @len: length of attribute stream * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_attr(pos, head, len, rem) \ for (pos = head, rem = len; \ nla_ok(pos, rem); \ pos = nla_next(pos, &(rem))) /** * nla_for_each_nested - iterate over nested attributes * @pos: loop counter, set to current attribute * @nla: attribute containing the nested attributes * @rem: initialized to len, holds bytes currently remaining in stream */ #define nla_for_each_nested(pos, nla, rem) \ nla_for_each_attr(pos, nla_data(nla), nla_len(nla), rem) /** * nla_is_last - Test if attribute is last in stream * @nla: attribute to test * @rem: bytes remaining in stream */ static inline bool nla_is_last(const struct nlattr *nla, int rem) { return nla->nla_len == rem; } void nla_get_range_unsigned(const struct nla_policy *pt, struct netlink_range_validation *range); void nla_get_range_signed(const struct nla_policy *pt, struct netlink_range_validation_signed *range); struct netlink_policy_dump_state; int netlink_policy_dump_add_policy(struct netlink_policy_dump_state **pstate, const struct nla_policy *policy, unsigned int maxtype); int netlink_policy_dump_get_policy_idx(struct netlink_policy_dump_state *state, const struct nla_policy *policy, unsigned int maxtype); bool netlink_policy_dump_loop(struct netlink_policy_dump_state *state); int netlink_policy_dump_write(struct sk_buff *skb, struct netlink_policy_dump_state *state); int netlink_policy_dump_attr_size_estimate(const struct nla_policy *pt); int netlink_policy_dump_write_attr(struct sk_buff *skb, const struct nla_policy *pt, int nestattr); void netlink_policy_dump_free(struct netlink_policy_dump_state *state); #endif
5 10 9 10 10 3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 /* SPDX-License-Identifier: GPL-2.0 */ /* * This header provides generic wrappers for memory access instrumentation that * the compiler cannot emit for: KASAN, KCSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H #include <linux/compiler.h> #include <linux/kasan-checks.h> #include <linux/kcsan-checks.h> #include <linux/types.h> /** * instrument_read - instrument regular read access * * Instrument a regular read access. The instrumentation should be inserted * before the actual read happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_read(v, size); } /** * instrument_write - instrument regular write access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_write(v, size); } /** * instrument_read_write - instrument regular read-write access * * Instrument a regular write access. The instrumentation should be inserted * before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_read_write(v, size); } /** * instrument_atomic_read - instrument atomic read access * * Instrument an atomic read access. The instrumentation should be inserted * before the actual read happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_atomic_read(const volatile void *v, size_t size) { kasan_check_read(v, size); kcsan_check_atomic_read(v, size); } /** * instrument_atomic_write - instrument atomic write access * * Instrument an atomic write access. The instrumentation should be inserted * before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_atomic_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_write(v, size); } /** * instrument_atomic_read_write - instrument atomic read-write access * * Instrument an atomic read-write access. The instrumentation should be * inserted before the actual write happens. * * @ptr address of access * @size size of access */ static __always_inline void instrument_atomic_read_write(const volatile void *v, size_t size) { kasan_check_write(v, size); kcsan_check_atomic_read_write(v, size); } /** * instrument_copy_to_user - instrument reads of copy_to_user * * Instrument reads from kernel memory, that are due to copy_to_user (and * variants). The instrumentation must be inserted before the accesses. * * @to destination address * @from source address * @n number of bytes to copy */ static __always_inline void instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); } /** * instrument_copy_from_user - instrument writes of copy_from_user * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. * * @to destination address * @from source address * @n number of bytes to copy */ static __always_inline void instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } #endif /* _LINUX_INSTRUMENTED_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef _ASM_X86_PROCESSOR_H #define _ASM_X86_PROCESSOR_H #include <asm/processor-flags.h> /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; struct io_bitmap; struct vm86; #include <asm/math_emu.h> #include <asm/segment.h> #include <asm/types.h> #include <uapi/asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeatures.h> #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> #include <asm/msr.h> #include <asm/desc_defs.h> #include <asm/nops.h> #include <asm/special_insns.h> #include <asm/fpu/types.h> #include <asm/unwind_hints.h> #include <asm/vmxfeatures.h> #include <asm/vdso/processor.h> #include <linux/personality.h> #include <linux/cache.h> #include <linux/threads.h> #include <linux/math64.h> #include <linux/err.h> #include <linux/irqflags.h> #include <linux/mem_encrypt.h> /* * We handle most unaligned accesses in hardware. On the other hand * unaligned DMA can be quite expensive on some Nehalem processors. * * Based on this we disable the IP header alignment in network drivers. */ #define NET_IP_ALIGN 0 #define HBP_NUM 4 /* * These alignment constraints are for performance in the vSMP case, * but in the task_struct case we must also meet hardware imposed * alignment requirements of the FPU state: */ #ifdef CONFIG_X86_VSMP # define ARCH_MIN_TASKALIGN (1 << INTERNODE_CACHE_SHIFT) # define ARCH_MIN_MMSTRUCT_ALIGN (1 << INTERNODE_CACHE_SHIFT) #else # define ARCH_MIN_TASKALIGN __alignof__(union fpregs_state) # define ARCH_MIN_MMSTRUCT_ALIGN 0 #endif enum tlb_infos { ENTRIES, NR_INFO }; extern u16 __read_mostly tlb_lli_4k[NR_INFO]; extern u16 __read_mostly tlb_lli_2m[NR_INFO]; extern u16 __read_mostly tlb_lli_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_4k[NR_INFO]; extern u16 __read_mostly tlb_lld_2m[NR_INFO]; extern u16 __read_mostly tlb_lld_4m[NR_INFO]; extern u16 __read_mostly tlb_lld_1g[NR_INFO]; /* * CPU type and hardware bug flags. Kept separately for each CPU. * Members of this structure are referenced in head_32.S, so think twice * before touching them. [mj] */ struct cpuinfo_x86 { __u8 x86; /* CPU family */ __u8 x86_vendor; /* CPU vendor */ __u8 x86_model; __u8 x86_stepping; #ifdef CONFIG_X86_64 /* Number of 4K pages in DTLB/ITLB combined(in pages): */ int x86_tlbsize; #endif #ifdef CONFIG_X86_VMX_FEATURE_NAMES __u32 vmx_capability[NVMXINTS]; #endif __u8 x86_virt_bits; __u8 x86_phys_bits; /* CPUID returned core id bits: */ __u8 x86_coreid_bits; __u8 cu_id; /* Max extended CPUID function supported: */ __u32 extended_cpuid_level; /* Maximum supported CPUID level, -1=no CPUID: */ int cpuid_level; /* * Align to size of unsigned long because the x86_capability array * is passed to bitops which require the alignment. Use unnamed * union to enforce the array is aligned to size of unsigned long. */ union { __u32 x86_capability[NCAPINTS + NBUGINTS]; unsigned long x86_capability_alignment; }; char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ /* Cache QoS architectural values, valid only on the BSP: */ int x86_cache_max_rmid; /* max index */ int x86_cache_occ_scale; /* scale to bytes */ int x86_cache_mbm_width_offset; int x86_power; unsigned long loops_per_jiffy; /* cpuid returned max cores value: */ u16 x86_max_cores; u16 apicid; u16 initial_apicid; u16 x86_clflush_size; /* number of cores as seen by the OS: */ u16 booted_cores; /* Physical processor id: */ u16 phys_proc_id; /* Logical processor id: */ u16 logical_proc_id; /* Core id: */ u16 cpu_core_id; u16 cpu_die_id; u16 logical_die_id; /* Index into per_cpu list: */ u16 cpu_index; u32 microcode; /* Address space bits used by the cache internally */ u8 x86_cache_bits; unsigned initialized : 1; } __randomize_layout; struct cpuid_regs { u32 eax, ebx, ecx, edx; }; enum cpuid_regs_idx { CPUID_EAX = 0, CPUID_EBX, CPUID_ECX, CPUID_EDX, }; #define X86_VENDOR_INTEL 0 #define X86_VENDOR_CYRIX 1 #define X86_VENDOR_AMD 2 #define X86_VENDOR_UMC 3 #define X86_VENDOR_CENTAUR 5 #define X86_VENDOR_TRANSMETA 7 #define X86_VENDOR_NSC 8 #define X86_VENDOR_HYGON 9 #define X86_VENDOR_ZHAOXIN 10 #define X86_VENDOR_NUM 11 #define X86_VENDOR_UNKNOWN 0xff /* * capabilities of CPUs */ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); #define cpu_data(cpu) per_cpu(cpu_info, cpu) #else #define cpu_info boot_cpu_data #define cpu_data(cpu) boot_cpu_data #endif extern const struct seq_operations cpuinfo_op; #define cache_line_size() (boot_cpu_data.x86_cache_alignment) extern void cpu_detect(struct cpuinfo_x86 *c); static inline unsigned long long l1tf_pfn_limit(void) { return BIT_ULL(boot_cpu_data.x86_cache_bits - 1 - PAGE_SHIFT); } extern void early_cpu_init(void); extern void identify_boot_cpu(void); extern void identify_secondary_cpu(struct cpuinfo_x86 *); extern void print_cpu_info(struct cpuinfo_x86 *); void print_cpu_msr(struct cpuinfo_x86 *); #ifdef CONFIG_X86_32 extern int have_cpuid_p(void); #else static inline int have_cpuid_p(void) { return 1; } #endif static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { /* ecx is often an input as well as an output. */ asm volatile("cpuid" : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (*eax), "2" (*ecx) : "memory"); } #define native_cpuid_reg(reg) \ static inline unsigned int native_cpuid_##reg(unsigned int op) \ { \ unsigned int eax = op, ebx, ecx = 0, edx; \ \ native_cpuid(&eax, &ebx, &ecx, &edx); \ \ return reg; \ } /* * Native CPUID functions returning a single datum. */ native_cpuid_reg(eax) native_cpuid_reg(ebx) native_cpuid_reg(ecx) native_cpuid_reg(edx) /* * Friendlier CR3 helpers. */ static inline unsigned long read_cr3_pa(void) { return __read_cr3() & CR3_ADDR_MASK; } static inline unsigned long native_read_cr3_pa(void) { return __native_read_cr3() & CR3_ADDR_MASK; } static inline void load_cr3(pgd_t *pgdir) { write_cr3(__sme_pa(pgdir)); } /* * Note that while the legacy 'TSS' name comes from 'Task State Segment', * on modern x86 CPUs the TSS also holds information important to 64-bit mode, * unrelated to the task-switch mechanism: */ #ifdef CONFIG_X86_32 /* This is the TSS defined by the hardware. */ struct x86_hw_tss { unsigned short back_link, __blh; unsigned long sp0; unsigned short ss0, __ss0h; unsigned long sp1; /* * We don't use ring 1, so ss1 is a convenient scratch space in * the same cacheline as sp0. We use ss1 to cache the value in * MSR_IA32_SYSENTER_CS. When we context switch * MSR_IA32_SYSENTER_CS, we first check if the new value being * written matches ss1, and, if it's not, then we wrmsr the new * value and update ss1. * * The only reason we context switch MSR_IA32_SYSENTER_CS is * that we set it to zero in vm86 tasks to avoid corrupting the * stack if we were to go through the sysenter path from vm86 * mode. */ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; unsigned long ip; unsigned long flags; unsigned long ax; unsigned long cx; unsigned long dx; unsigned long bx; unsigned long sp; unsigned long bp; unsigned long si; unsigned long di; unsigned short es, __esh; unsigned short cs, __csh; unsigned short ss, __ssh; unsigned short ds, __dsh; unsigned short fs, __fsh; unsigned short gs, __gsh; unsigned short ldt, __ldth; unsigned short trace; unsigned short io_bitmap_base; } __attribute__((packed)); #else struct x86_hw_tss { u32 reserved1; u64 sp0; /* * We store cpu_current_top_of_stack in sp1 so it's always accessible. * Linux does not use ring 1, so sp1 is not otherwise needed. */ u64 sp1; /* * Since Linux does not use ring 2, the 'sp2' slot is unused by * hardware. entry_SYSCALL_64 uses it as scratch space to stash * the user RSP value. */ u64 sp2; u64 reserved2; u64 ist[7]; u32 reserved3; u32 reserved4; u16 reserved5; u16 io_bitmap_base; } __attribute__((packed)); #endif /* * IO-bitmap sizes: */ #define IO_BITMAP_BITS 65536 #define IO_BITMAP_BYTES (IO_BITMAP_BITS / BITS_PER_BYTE) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES / sizeof(long)) #define IO_BITMAP_OFFSET_VALID_MAP \ (offsetof(struct tss_struct, io_bitmap.bitmap) - \ offsetof(struct tss_struct, x86_tss)) #define IO_BITMAP_OFFSET_VALID_ALL \ (offsetof(struct tss_struct, io_bitmap.mapall) - \ offsetof(struct tss_struct, x86_tss)) #ifdef CONFIG_X86_IOPL_IOPERM /* * sizeof(unsigned long) coming from an extra "long" at the end of the * iobitmap. The limit is inclusive, i.e. the last valid byte. */ # define __KERNEL_TSS_LIMIT \ (IO_BITMAP_OFFSET_VALID_ALL + IO_BITMAP_BYTES + \ sizeof(unsigned long) - 1) #else # define __KERNEL_TSS_LIMIT \ (offsetof(struct tss_struct, x86_tss) + sizeof(struct x86_hw_tss) - 1) #endif /* Base offset outside of TSS_LIMIT so unpriviledged IO causes #GP */ #define IO_BITMAP_OFFSET_INVALID (__KERNEL_TSS_LIMIT + 1) struct entry_stack { char stack[PAGE_SIZE]; }; struct entry_stack_page { struct entry_stack stack; } __aligned(PAGE_SIZE); /* * All IO bitmap related data stored in the TSS: */ struct x86_io_bitmap { /* The sequence number of the last active bitmap. */ u64 prev_sequence; /* * Store the dirty size of the last io bitmap offender. The next * one will have to do the cleanup as the switch out to a non io * bitmap user will just set x86_tss.io_bitmap_base to a value * outside of the TSS limit. So for sane tasks there is no need to * actually touch the io_bitmap at all. */ unsigned int prev_max; /* * The extra 1 is there because the CPU will access an * additional byte beyond the end of the IO permission * bitmap. The extra byte must be all 1 bits, and must * be within the limit. */ unsigned long bitmap[IO_BITMAP_LONGS + 1]; /* * Special I/O bitmap to emulate IOPL(3). All bytes zero, * except the additional byte at the end. */ unsigned long mapall[IO_BITMAP_LONGS + 1]; }; struct tss_struct { /* * The fixed hardware portion. This must not cross a page boundary * at risk of violating the SDM's advice and potentially triggering * errata. */ struct x86_hw_tss x86_tss; struct x86_io_bitmap io_bitmap; } __aligned(PAGE_SIZE); DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); /* Per CPU interrupt stacks */ struct irq_stack { char stack[IRQ_STACK_SIZE]; } __aligned(IRQ_STACK_SIZE); DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); #else /* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif #ifdef CONFIG_X86_64 struct fixed_percpu_data { /* * GCC hardcodes the stack canary as %gs:40. Since the * irq_stack is the object at %gs:0, we reserve the bottom * 48 bytes of the irq stack for the canary. */ char gs_base[40]; unsigned long stack_canary; }; DECLARE_PER_CPU_FIRST(struct fixed_percpu_data, fixed_percpu_data) __visible; DECLARE_INIT_PER_CPU(fixed_percpu_data); static inline unsigned long cpu_kernelmode_gs_base(int cpu) { return (unsigned long)per_cpu(fixed_percpu_data.gs_base, cpu); } DECLARE_PER_CPU(unsigned int, irq_count); extern asmlinkage void ignore_sysret(void); /* Save actual FS/GS selectors and bases to current->thread */ void current_save_fsgs(void); #else /* X86_64 */ #ifdef CONFIG_STACKPROTECTOR /* * Make sure stack canary segment base is cached-aligned: * "For Intel Atom processors, avoid non zero segment base address * that is not aligned to cache line boundary at all cost." * (Optim Ref Manual Assembly/Compiler Coding Rule 15.) */ struct stack_canary { char __pad[20]; /* canary at %gs:20 */ unsigned long canary; }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif /* Per CPU softirq stack pointer */ DECLARE_PER_CPU(struct irq_stack *, softirq_stack_ptr); #endif /* X86_64 */ extern unsigned int fpu_kernel_xstate_size; extern unsigned int fpu_user_xstate_size; struct perf_event; struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; #ifdef CONFIG_X86_32 unsigned long sp0; #endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else unsigned short es; unsigned short ds; unsigned short fsindex; unsigned short gsindex; #endif #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; #else /* * XXX: this could presumably be unsigned short. Alternatively, * 32-bit kernels could be taught to use fsindex instead. */ unsigned long fs; unsigned long gs; #endif /* Save middle states of ptrace breakpoints */ struct perf_event *ptrace_bps[HBP_NUM]; /* Debug status used for traps, single steps, etc... */ unsigned long virtual_dr6; /* Keep track of the exact dr7 value set by the user */ unsigned long ptrace_dr7; /* Fault info: */ unsigned long cr2; unsigned long trap_nr; unsigned long error_code; #ifdef CONFIG_VM86 /* Virtual 86 mode info */ struct vm86 *vm86; #endif /* IO permissions: */ struct io_bitmap *io_bitmap; /* * IOPL. Priviledge level dependent I/O permission which is * emulated via the I/O bitmap to prevent user space from disabling * interrupts. */ unsigned long iopl_emul; unsigned int sig_on_uaccess_err:1; /* Floating point and extended processor state */ struct fpu fpu; /* * WARNING: 'fpu' is dynamically-sized. It *MUST* be at * the end. */ }; /* Whitelist the FPU state from the task_struct for hardened usercopy. */ static inline void arch_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { *offset = offsetof(struct thread_struct, fpu.state); *size = fpu_kernel_xstate_size; } static inline void native_load_sp0(unsigned long sp0) { this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } static __always_inline void native_swapgs(void) { #ifdef CONFIG_X86_64 asm volatile("swapgs" ::: "memory"); #endif } static inline unsigned long current_top_of_stack(void) { /* * We can't read directly from tss.sp0: sp0 on x86_32 is special in * and around vm86 mode and sp0 on x86_64 is special because of the * entry trampoline. */ return this_cpu_read_stable(cpu_current_top_of_stack); } static inline bool on_thread_stack(void) { return (unsigned long)(current_top_of_stack() - current_stack_pointer) < THREAD_SIZE; } #ifdef CONFIG_PARAVIRT_XXL #include <asm/paravirt.h> #else #define __cpuid native_cpuid static inline void load_sp0(unsigned long sp0) { native_load_sp0(sp0); } #endif /* CONFIG_PARAVIRT_XXL */ /* Free all resources held by a thread. */ extern void release_thread(struct task_struct *); unsigned long get_wchan(struct task_struct *p); /* * Generic CPUID function * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx * resulting in stale register contents being returned. */ static inline void cpuid(unsigned int op, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = 0; __cpuid(eax, ebx, ecx, edx); } /* Some CPUID calls want 'count' to be placed in ecx */ static inline void cpuid_count(unsigned int op, int count, unsigned int *eax, unsigned int *ebx, unsigned int *ecx, unsigned int *edx) { *eax = op; *ecx = count; __cpuid(eax, ebx, ecx, edx); } /* * CPUID functions returning a single datum */ static inline unsigned int cpuid_eax(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return eax; } static inline unsigned int cpuid_ebx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return ebx; } static inline unsigned int cpuid_ecx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return ecx; } static inline unsigned int cpuid_edx(unsigned int op) { unsigned int eax, ebx, ecx, edx; cpuid(op, &eax, &ebx, &ecx, &edx); return edx; } extern void select_idle_routine(const struct cpuinfo_x86 *c); extern void amd_e400_c1e_apic_setup(void); extern unsigned long boot_option_idle_override; enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, IDLE_POLL}; extern void enable_sep_cpu(void); extern int sysenter_setup(void); /* Defined in head.S */ extern struct desc_ptr early_gdt_descr; extern void switch_to_new_gdt(int); extern void load_direct_gdt(int); extern void load_fixmap_gdt(int); extern void load_percpu_segment(int); extern void cpu_init(void); extern void cpu_init_exception_handling(void); extern void cr4_init(void); static inline unsigned long get_debugctlmsr(void) { unsigned long debugctlmsr = 0; #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return 0; #endif rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); return debugctlmsr; } static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR if (boot_cpu_data.x86 < 6) return; #endif wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } extern void set_task_blockstep(struct task_struct *task, bool on); /* Boot loader type from the setup header: */ extern int bootloader_type; extern int bootloader_version; extern char ignore_fpu_irq; #define HAVE_ARCH_PICK_MMAP_LAYOUT 1 #define ARCH_HAS_PREFETCHW #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 # define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else # define BASE_PREFETCH "prefetcht0 %P1" #endif /* * Prefetch instructions for Pentium III (+) and AMD Athlon (+) * * It's not worth to care about 3dnow prefetches for the K6 * because they are microcoded there and very slow. */ static inline void prefetch(const void *x) { alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, "m" (*(const char *)x)); } /* * 3dnow prefetch to get an exclusive cache line. * Useful for spinlocks to avoid one state transition in the * cache coherency protocol: */ static __always_inline void prefetchw(const void *x) { alternative_input(BASE_PREFETCH, "prefetchw %P1", X86_FEATURE_3DNOWPREFETCH, "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) { prefetchw(x); } #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) #define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) #define task_pt_regs(task) \ ({ \ unsigned long __ptr = (unsigned long)task_stack_page(task); \ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ ((struct pt_regs *)__ptr) - 1; \ }) #ifdef CONFIG_X86_32 #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ } #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else #define INIT_THREAD { } extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp); /* * This decides where the kernel will search for a free chunk of vm * space during mmap's. */ #define __TASK_UNMAPPED_BASE(task_size) (PAGE_ALIGN(task_size / 3)) #define TASK_UNMAPPED_BASE __TASK_UNMAPPED_BASE(TASK_SIZE_LOW) #define KSTK_EIP(task) (task_pt_regs(task)->ip) /* Get/set a process' ability to use the timestamp counter instruction */ #define GET_TSC_CTL(adr) get_tsc_mode((adr)) #define SET_TSC_CTL(val) set_tsc_mode((val)) extern int get_tsc_mode(unsigned long adr); extern int set_tsc_mode(unsigned int val); DECLARE_PER_CPU(u64, msr_misc_features_shadow); #ifdef CONFIG_CPU_SUP_AMD extern u16 amd_get_nb_id(int cpu); extern u32 amd_get_nodes_per_socket(void); #else static inline u16 amd_get_nb_id(int cpu) { return 0; } static inline u32 amd_get_nodes_per_socket(void) { return 0; } #endif static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { uint32_t base, eax, signature[3]; for (base = 0x40000000; base < 0x40010000; base += 0x100) { cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); if (!memcmp(sig, signature, 12) && (leaves == 0 || ((eax - base) >= leaves))) return base; } return 0; } extern unsigned long arch_align_stack(unsigned long sp); void free_init_pages(const char *what, unsigned long begin, unsigned long end); extern void free_kernel_image_pages(const char *what, void *begin, void *end); void default_idle(void); #ifdef CONFIG_XEN bool xen_set_default_idle(void); #else #define xen_set_default_idle 0 #endif void stop_this_cpu(void *dummy); void microcode_check(void); enum l1tf_mitigations { L1TF_MITIGATION_OFF, L1TF_MITIGATION_FLUSH_NOWARN, L1TF_MITIGATION_FLUSH, L1TF_MITIGATION_FLUSH_NOSMT, L1TF_MITIGATION_FULL, L1TF_MITIGATION_FULL_FORCE }; extern enum l1tf_mitigations l1tf_mitigation; enum mds_mitigations { MDS_MITIGATION_OFF, MDS_MITIGATION_FULL, MDS_MITIGATION_VMWERV, }; #endif /* _ASM_X86_PROCESSOR_H */
1 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055 2056 2057 2058 2059 2060 2061 2062 2063 2064 2065 2066 2067 2068 2069 2070 2071 2072 2073 2074 2075 2076 2077 2078 2079 2080 2081 2082 2083 2084 2085 2086 2087 2088 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 // SPDX-License-Identifier: GPL-2.0-only /* * linux/fs/nfs/inode.c * * Copyright (C) 1992 Rick Sladkey * * nfs inode and superblock handling functions * * Modularised by Alan Cox <alan@lxorguk.ukuu.org.uk>, while hacking some * experimental NFS changes. Modularisation taken straight from SYS5 fs. * * Change to nfs_read_super() to permit NFS mounts to multi-homed hosts. * J.S.Peatfield@damtp.cam.ac.uk * */ #include <linux/module.h> #include <linux/init.h> #include <linux/sched/signal.h> #include <linux/time.h> #include <linux/kernel.h> #include <linux/mm.h> #include <linux/string.h> #include <linux/stat.h> #include <linux/errno.h> #include <linux/unistd.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/metrics.h> #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include <linux/nfs4_mount.h> #include <linux/lockd/bind.h> #include <linux/seq_file.h> #include <linux/mount.h> #include <linux/vfs.h> #include <linux/inet.h> #include <linux/nfs_xdr.h> #include <linux/slab.h> #include <linux/compat.h> #include <linux/freezer.h> #include <linux/uaccess.h> #include <linux/iversion.h> #include "nfs4_fs.h" #include "callback.h" #include "delegation.h" #include "iostat.h" #include "internal.h" #include "fscache.h" #include "pnfs.h" #include "nfs.h" #include "netns.h" #include "sysfs.h" #include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_VFS #define NFS_64_BIT_INODE_NUMBERS_ENABLED 1 /* Default is to see 64-bit inode numbers */ static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED; static int nfs_update_inode(struct inode *, struct nfs_fattr *); static struct kmem_cache * nfs_inode_cachep; static inline unsigned long nfs_fattr_to_ino_t(struct nfs_fattr *fattr) { return nfs_fileid_to_ino_t(fattr->fileid); } static int nfs_wait_killable(int mode) { freezable_schedule_unsafe(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { return nfs_wait_killable(mode); } EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid * * This function returns a 32-bit inode number if the boot parameter * nfs.enable_ino64 is zero. */ u64 nfs_compat_user_ino64(u64 fileid) { #ifdef CONFIG_COMPAT compat_ulong_t ino; #else unsigned long ino; #endif if (enable_ino64) return fileid; ino = fileid; if (sizeof(ino) < sizeof(fileid)) ino ^= fileid >> (sizeof(fileid)-sizeof(ino)) * 8; return ino; } int nfs_drop_inode(struct inode *inode) { return NFS_STALE(inode) || generic_drop_inode(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); void nfs_clear_inode(struct inode *inode) { /* * The following should never happen... */ WARN_ON_ONCE(nfs_have_writebacks(inode)); WARN_ON_ONCE(!list_empty(&NFS_I(inode)->open_files)); nfs_zap_acl_cache(inode); nfs_access_zap_cache(inode); nfs_fscache_clear_inode(inode); } EXPORT_SYMBOL_GPL(nfs_clear_inode); void nfs_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); clear_inode(inode); nfs_clear_inode(inode); } int nfs_sync_inode(struct inode *inode) { inode_dio_wait(inode); return nfs_wb_all(inode); } EXPORT_SYMBOL_GPL(nfs_sync_inode); /** * nfs_sync_mapping - helper to flush all mmapped dirty data to disk * @mapping: pointer to struct address_space */ int nfs_sync_mapping(struct address_space *mapping) { int ret = 0; if (mapping->nrpages != 0) { unmap_mapping_range(mapping, 0, 0, 0); ret = nfs_wb_all(mapping->host); } return ret; } static int nfs_attribute_timeout(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); /* Special case for the pagecache or access cache */ if (flags == NFS_INO_REVAL_PAGECACHE && !(cache_validity & NFS_INO_REVAL_FORCED)) return false; return (cache_validity & flags) != 0; } static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); if ((cache_validity & flags) != 0) return true; if (nfs_attribute_timeout(inode)) return true; return false; } bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) { if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return nfs_check_cache_invalid_delegated(inode, flags); return nfs_check_cache_invalid_not_delegated(inode, flags); } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); bool have_delegation = NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); if (have_delegation) { if (!(flags & NFS_INO_REVAL_FORCED)) flags &= ~NFS_INO_INVALID_OTHER; flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_XATTR); } else if (flags & NFS_INO_REVAL_PAGECACHE) flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); nfsi->cache_validity |= flags; if (flags & NFS_INO_INVALID_DATA) nfs_fscache_invalidate(inode); } /* * Invalidate the local caches */ static void nfs_zap_caches_locked(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); int mode = inode->i_mode; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = jiffies; memset(NFS_I(inode)->cookieverf, 0, sizeof(NFS_I(inode)->cookieverf)); if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); } else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR | NFS_INO_REVAL_PAGECACHE); nfs_zap_label_cache_locked(nfsi); } void nfs_zap_caches(struct inode *inode) { spin_lock(&inode->i_lock); nfs_zap_caches_locked(inode); spin_unlock(&inode->i_lock); } void nfs_zap_mapping(struct inode *inode, struct address_space *mapping) { if (mapping->nrpages != 0) { spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); spin_unlock(&inode->i_lock); } } void nfs_zap_acl_cache(struct inode *inode) { void (*clear_acl_cache)(struct inode *); clear_acl_cache = NFS_PROTO(inode)->clear_acl_cache; if (clear_acl_cache != NULL) clear_acl_cache(inode); spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_zap_acl_cache); void nfs_invalidate_atime(struct inode *inode) { spin_lock(&inode->i_lock); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_invalidate_atime); /* * Invalidate, but do not unhash, the inode. * NB: must be called with inode->i_lock held! */ static void nfs_set_inode_stale_locked(struct inode *inode) { set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); nfs_zap_caches_locked(inode); trace_nfs_set_inode_stale(inode); } void nfs_set_inode_stale(struct inode *inode) { spin_lock(&inode->i_lock); nfs_set_inode_stale_locked(inode); spin_unlock(&inode->i_lock); } struct nfs_find_desc { struct nfs_fh *fh; struct nfs_fattr *fattr; }; /* * In NFSv3 we can have 64bit inode numbers. In order to support * this, and re-exported directories (also seen in NFSv2) * we are forced to allow 2 different inodes to have the same * i_ino. */ static int nfs_find_actor(struct inode *inode, void *opaque) { struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; struct nfs_fh *fh = desc->fh; struct nfs_fattr *fattr = desc->fattr; if (NFS_FILEID(inode) != fattr->fileid) return 0; if (inode_wrong_type(inode, fattr->mode)) return 0; if (nfs_compare_fh(NFS_FH(inode), fh)) return 0; if (is_bad_inode(inode) || NFS_STALE(inode)) return 0; return 1; } static int nfs_init_locked(struct inode *inode, void *opaque) { struct nfs_find_desc *desc = (struct nfs_find_desc *)opaque; struct nfs_fattr *fattr = desc->fattr; set_nfs_fileid(inode, fattr->fileid); inode->i_mode = fattr->mode; nfs_copy_fh(NFS_FH(inode), desc->fh); return 0; } #ifdef CONFIG_NFS_V4_SECURITY_LABEL static void nfs_clear_label_invalid(struct inode *inode) { spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_LABEL; spin_unlock(&inode->i_lock); } void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label) { int error; if (label == NULL) return; if ((fattr->valid & NFS_ATTR_FATTR_V4_SECURITY_LABEL) && inode->i_security) { error = security_inode_notifysecctx(inode, label->label, label->len); if (error) printk(KERN_ERR "%s() %s %d " "security_inode_notifysecctx() %d\n", __func__, (char *)label->label, label->len, error); nfs_clear_label_invalid(inode); } } struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags) { struct nfs4_label *label = NULL; int minor_version = server->nfs_client->cl_minorversion; if (minor_version < 2) return label; if (!(server->caps & NFS_CAP_SECURITY_LABEL)) return label; label = kzalloc(sizeof(struct nfs4_label), flags); if (label == NULL) return ERR_PTR(-ENOMEM); label->label = kzalloc(NFS4_MAXLABELLEN, flags); if (label->label == NULL) { kfree(label); return ERR_PTR(-ENOMEM); } label->len = NFS4_MAXLABELLEN; return label; } EXPORT_SYMBOL_GPL(nfs4_label_alloc); #else void nfs_setsecurity(struct inode *inode, struct nfs_fattr *fattr, struct nfs4_label *label) { } #endif EXPORT_SYMBOL_GPL(nfs_setsecurity); /* Search for inode identified by fh, fileid and i_mode in inode cache. */ struct inode * nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh) { struct nfs_find_desc desc = { .fh = fh, .fattr = fattr, }; struct inode *inode; unsigned long hash; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID) || !(fattr->valid & NFS_ATTR_FATTR_TYPE)) return NULL; hash = nfs_fattr_to_ino_t(fattr); inode = ilookup5(sb, hash, nfs_find_actor, &desc); dprintk("%s: returning %p\n", __func__, inode); return inode; } /* * This is our front-end to iget that looks up inodes by file handle * instead of inode number. */ struct inode * nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, struct nfs4_label *label) { struct nfs_find_desc desc = { .fh = fh, .fattr = fattr }; struct inode *inode = ERR_PTR(-ENOENT); unsigned long hash; nfs_attr_check_mountpoint(sb, fattr); if (nfs_attr_use_mounted_on_fileid(fattr)) fattr->fileid = fattr->mounted_on_fileid; else if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0) goto out_no_inode; if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0) goto out_no_inode; hash = nfs_fattr_to_ino_t(fattr); inode = iget5_locked(sb, hash, nfs_find_actor, nfs_init_locked, &desc); if (inode == NULL) { inode = ERR_PTR(-ENOMEM); goto out_no_inode; } if (inode->i_state & I_NEW) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long now = jiffies; /* We set i_ino for the few things that still rely on it, * such as stat(2) */ inode->i_ino = hash; /* We can't support update_atime(), since the server will reset it */ inode->i_flags |= S_NOATIME|S_NOCMTIME; inode->i_mode = fattr->mode; nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 && nfs_server_capable(inode, NFS_CAP_MODE)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->file_inode_ops; if (S_ISREG(inode->i_mode)) { inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops; inode->i_data.a_ops = &nfs_file_aops; } else if (S_ISDIR(inode->i_mode)) { inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops; inode->i_fop = &nfs_dir_operations; inode->i_data.a_ops = &nfs_dir_aops; /* Deal with crossing mountpoints */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT || fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) inode->i_op = &nfs_referral_inode_operations; else inode->i_op = &nfs_mountpoint_inode_operations; inode->i_fop = NULL; inode->i_flags |= S_AUTOMOUNT; } } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nfs_symlink_inode_operations; inode_nohighmem(inode); } else init_special_inode(inode, inode->i_mode, fattr->rdev); memset(&inode->i_atime, 0, sizeof(inode->i_atime)); memset(&inode->i_mtime, 0, sizeof(inode->i_mtime)); memset(&inode->i_ctime, 0, sizeof(inode->i_ctime)); inode_set_iversion_raw(inode, 0); inode->i_size = 0; clear_nlink(inode); inode->i_uid = make_kuid(&init_user_ns, -2); inode->i_gid = make_kgid(&init_user_ns, -2); inode->i_blocks = 0; memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); nfsi->write_io = 0; nfsi->read_io = 0; nfsi->read_cache_jiffies = fattr->time_start; nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (nfs_server_capable(inode, NFS_CAP_ATIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (nfs_server_capable(inode, NFS_CAP_MTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else if (nfs_server_capable(inode, NFS_CAP_CTIME)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); else if (nfs_server_capable(inode, NFS_CAP_NLINK)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (nfs_server_capable(inode, NFS_CAP_OWNER)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (nfs_server_capable(inode, NFS_CAP_XATTR)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } if (nfsi->cache_validity != 0) nfsi->cache_validity |= NFS_INO_REVAL_FORCED; nfs_setsecurity(inode, fattr, label); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; nfsi->access_cache = RB_ROOT; nfs_fscache_init_inode(inode); unlock_new_inode(inode); } else { int err = nfs_refresh_inode(inode, fattr); if (err < 0) { iput(inode); inode = ERR_PTR(err); goto out_no_inode; } } dprintk("NFS: nfs_fhget(%s/%Lu fh_crc=0x%08x ct=%d)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), nfs_display_fhandle_hash(fh), atomic_read(&inode->i_count)); out: return inode; out_no_inode: dprintk("nfs_fhget: iget failed with error %ld\n", PTR_ERR(inode)); goto out; } EXPORT_SYMBOL_GPL(nfs_fhget); #define NFS_VALID_ATTRS (ATTR_MODE|ATTR_UID|ATTR_GID|ATTR_SIZE|ATTR_ATIME|ATTR_ATIME_SET|ATTR_MTIME|ATTR_MTIME_SET|ATTR_FILE|ATTR_OPEN) int nfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); struct nfs_fattr *fattr; int error = 0; nfs_inc_stats(inode, NFSIOS_VFSSETATTR); /* skip mode change if it's just for clearing setuid/setgid */ if (attr->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { BUG_ON(!S_ISREG(inode->i_mode)); error = inode_newsize_ok(inode, attr->ia_size); if (error) return error; if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; } /* Optimization: if the end result is no change, don't RPC */ attr->ia_valid &= NFS_VALID_ATTRS; if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; trace_nfs_setattr_enter(inode); /* Write all dirty data */ if (S_ISREG(inode->i_mode)) nfs_sync_inode(inode); fattr = nfs_alloc_fattr(); if (fattr == NULL) { error = -ENOMEM; goto out; } error = NFS_PROTO(inode)->setattr(dentry, fattr, attr); if (error == 0) error = nfs_refresh_inode(inode, fattr); nfs_free_fattr(fattr); out: trace_nfs_setattr_exit(inode, error); return error; } EXPORT_SYMBOL_GPL(nfs_setattr); /** * nfs_vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used * @offset: file offset to start truncating * * This is a copy of the common vmtruncate, but with the locking * corrected to take into account the fact that NFS requires * inode->i_size to be updated under the inode->i_lock. * Note: must be called with inode->i_lock held! */ static int nfs_vmtruncate(struct inode * inode, loff_t offset) { int err; err = inode_newsize_ok(inode, offset); if (err) goto out; i_size_write(inode, offset); /* Optimisation */ if (offset == 0) NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_DATA | NFS_INO_DATA_INVAL_DEFER); NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE; spin_unlock(&inode->i_lock); truncate_pagecache(inode, offset); spin_lock(&inode->i_lock); out: return err; } /** * nfs_setattr_update_inode - Update inode metadata after a setattr call. * @inode: pointer to struct inode * @attr: pointer to struct iattr * @fattr: pointer to struct nfs_fattr * * Note: we do this in the *proc.c in order to ensure that * it works for things like exclusive creates too. */ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, struct nfs_fattr *fattr) { /* Barrier: bump the attribute generation count. */ nfs_fattr_set_barrier(fattr); spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & ATTR_SIZE) != 0) { nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; inode->i_mode = mode; } if ((attr->ia_valid & ATTR_UID) != 0) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); nfs_set_cache_invalid(inode, NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL); } if (attr->ia_valid & (ATTR_ATIME_SET|ATTR_ATIME)) { NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (attr->ia_valid & ATTR_ATIME_SET) inode->i_atime = attr->ia_atime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); } if (attr->ia_valid & (ATTR_MTIME_SET|ATTR_MTIME)) { NFS_I(inode)->cache_validity &= ~(NFS_INO_INVALID_MTIME | NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; else if (attr->ia_valid & ATTR_MTIME_SET) inode->i_mtime = attr->ia_mtime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; else nfs_set_cache_invalid(inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME); } if (fattr->valid) nfs_update_inode(inode, fattr); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) { struct dentry *parent; if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) return; parent = dget_parent(dentry); nfs_force_use_readdirplus(d_inode(parent)); dput(parent); } static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) { struct dentry *parent; if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) return; parent = dget_parent(dentry); nfs_advise_use_readdirplus(d_inode(parent)); dput(parent); } static bool nfs_need_revalidate_inode(struct inode *inode) { if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) return true; if (nfs_attribute_cache_expired(inode)) return true; return false; } int nfs_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { struct inode *inode = d_inode(path->dentry); struct nfs_server *server = NFS_SERVER(inode); unsigned long cache_validity; int err = 0; bool force_sync = query_flags & AT_STATX_FORCE_SYNC; bool do_update = false; trace_nfs_getattr_enter(inode); if ((query_flags & AT_STATX_DONT_SYNC) && !force_sync) { nfs_readdirplus_parent_cache_hit(path->dentry); goto out_no_update; } /* Flush out writes to the server in order to update c/mtime. */ if ((request_mask & (STATX_CTIME|STATX_MTIME)) && S_ISREG(inode->i_mode)) { err = filemap_write_and_wait(inode->i_mapping); if (err) goto out; } /* * We may force a getattr if the user cares about atime. * * Note that we only have to check the vfsmount flags here: * - NFS always sets S_NOATIME by so checking it would give a * bogus result * - NFS never sets SB_NOATIME or SB_NODIRATIME so there is * no point in checking those. */ if ((path->mnt->mnt_flags & MNT_NOATIME) || ((path->mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) request_mask &= ~STATX_ATIME; /* Is the user requesting attributes that might need revalidation? */ if (!(request_mask & (STATX_MODE|STATX_NLINK|STATX_ATIME|STATX_CTIME| STATX_MTIME|STATX_UID|STATX_GID| STATX_SIZE|STATX_BLOCKS))) goto out_no_revalidate; /* Check whether the cached attributes are stale */ do_update |= force_sync || nfs_attribute_cache_expired(inode); cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); do_update |= cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL); if (request_mask & STATX_ATIME) do_update |= cache_validity & NFS_INO_INVALID_ATIME; if (request_mask & (STATX_CTIME|STATX_MTIME)) do_update |= cache_validity & NFS_INO_REVAL_PAGECACHE; if (request_mask & STATX_BLOCKS) do_update |= cache_validity & NFS_INO_INVALID_BLOCKS; if (do_update) { /* Update the attribute cache */ if (!(server->flags & NFS_MOUNT_NOAC)) nfs_readdirplus_parent_cache_miss(path->dentry); else nfs_readdirplus_parent_cache_hit(path->dentry); err = __nfs_revalidate_inode(server, inode); if (err) goto out; } else nfs_readdirplus_parent_cache_hit(path->dentry); out_no_revalidate: /* Only return attributes that were revalidated. */ stat->result_mask &= request_mask; out_no_update: generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; out: trace_nfs_getattr_exit(inode, err); return err; } EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { refcount_set(&l_ctx->count, 1); l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *pos; list_for_each_entry_rcu(pos, &ctx->lock_context.list, list) { if (pos->lockowner != current->files) continue; if (refcount_inc_not_zero(&pos->count)) return pos; } return NULL; } struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) { struct nfs_lock_context *res, *new = NULL; struct inode *inode = d_inode(ctx->dentry); rcu_read_lock(); res = __nfs_find_lock_context(ctx); rcu_read_unlock(); if (res == NULL) { new = kmalloc(sizeof(*new), GFP_KERNEL); if (new == NULL) return ERR_PTR(-ENOMEM); nfs_init_lock_context(new); spin_lock(&inode->i_lock); res = __nfs_find_lock_context(ctx); if (res == NULL) { new->open_context = get_nfs_open_context(ctx); if (new->open_context) { list_add_tail_rcu(&new->list, &ctx->lock_context.list); res = new; new = NULL; } else res = ERR_PTR(-EBADF); } spin_unlock(&inode->i_lock); kfree(new); } return res; } EXPORT_SYMBOL_GPL(nfs_get_lock_context); void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { struct nfs_open_context *ctx = l_ctx->open_context; struct inode *inode = d_inode(ctx->dentry); if (!refcount_dec_and_lock(&l_ctx->count, &inode->i_lock)) return; list_del_rcu(&l_ctx->list); spin_unlock(&inode->i_lock); put_nfs_open_context(ctx); kfree_rcu(l_ctx, rcu_head); } EXPORT_SYMBOL_GPL(nfs_put_lock_context); /** * nfs_close_context - Common close_context() routine NFSv2/v3 * @ctx: pointer to context * @is_sync: is this a synchronous close * * Ensure that the attributes are up to date if we're mounted * with close-to-open semantics and we have cached data that will * need to be revalidated on open. */ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) { struct nfs_inode *nfsi; struct inode *inode; struct nfs_server *server; if (!(ctx->mode & FMODE_WRITE)) return; if (!is_sync) return; inode = d_inode(ctx->dentry); if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return; nfsi = NFS_I(inode); if (inode->i_mapping->nrpages == 0) return; if (nfsi->cache_validity & NFS_INO_INVALID_DATA) return; if (!list_empty(&nfsi->open_files)) return; server = NFS_SERVER(inode); if (server->flags & NFS_MOUNT_NOCTO) return; nfs_revalidate_inode(server, inode); } EXPORT_SYMBOL_GPL(nfs_close_context); struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode, struct file *filp) { struct nfs_open_context *ctx; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); nfs_sb_active(dentry->d_sb); ctx->dentry = dget(dentry); if (filp) ctx->cred = get_cred(filp->f_cred); else ctx->cred = get_current_cred(); ctx->ll_cred = NULL; ctx->state = NULL; ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; ctx->flock_owner = (fl_owner_t)filp; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); ctx->mdsthreshold = NULL; return ctx; } EXPORT_SYMBOL_GPL(alloc_nfs_open_context); struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL && refcount_inc_not_zero(&ctx->lock_context.count)) return ctx; return NULL; } EXPORT_SYMBOL_GPL(get_nfs_open_context); static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = d_inode(ctx->dentry); struct super_block *sb = ctx->dentry->d_sb; if (!refcount_dec_and_test(&ctx->lock_context.count)) return; if (!list_empty(&ctx->list)) { spin_lock(&inode->i_lock); list_del_rcu(&ctx->list); spin_unlock(&inode->i_lock); } if (inode != NULL) NFS_PROTO(inode)->close_context(ctx, is_sync); put_cred(ctx->cred); dput(ctx->dentry); nfs_sb_deactive(sb); put_rpccred(ctx->ll_cred); kfree(ctx->mdsthreshold); kfree_rcu(ctx, rcu_head); } void put_nfs_open_context(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 0); } EXPORT_SYMBOL_GPL(put_nfs_open_context); static void put_nfs_open_context_sync(struct nfs_open_context *ctx) { __put_nfs_open_context(ctx, 1); } /* * Ensure that mmap has a recent RPC credential for use when writing out * shared pages */ void nfs_inode_attach_open_context(struct nfs_open_context *ctx) { struct inode *inode = d_inode(ctx->dentry); struct nfs_inode *nfsi = NFS_I(inode); spin_lock(&inode->i_lock); if (list_empty(&nfsi->open_files) && (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER)) nfsi->cache_validity |= NFS_INO_INVALID_DATA | NFS_INO_REVAL_FORCED; list_add_tail_rcu(&ctx->list, &nfsi->open_files); spin_unlock(&inode->i_lock); } EXPORT_SYMBOL_GPL(nfs_inode_attach_open_context); void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx) { filp->private_data = get_nfs_open_context(ctx); set_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); if (list_empty(&ctx->list)) nfs_inode_attach_open_context(ctx); } EXPORT_SYMBOL_GPL(nfs_file_set_open_context); /* * Given an inode, search for an open context with the desired characteristics */ struct nfs_open_context *nfs_find_open_context(struct inode *inode, const struct cred *cred, fmode_t mode) { struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_context *pos, *ctx = NULL; rcu_read_lock(); list_for_each_entry_rcu(pos, &nfsi->open_files, list) { if (cred != NULL && cred_fscmp(pos->cred, cred) != 0) continue; if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) continue; if (!test_bit(NFS_CONTEXT_FILE_OPEN, &pos->flags)) continue; ctx = get_nfs_open_context(pos); if (ctx) break; } rcu_read_unlock(); return ctx; } void nfs_file_clear_open_context(struct file *filp) { struct nfs_open_context *ctx = nfs_file_open_context(filp); if (ctx) { struct inode *inode = d_inode(ctx->dentry); clear_bit(NFS_CONTEXT_FILE_OPEN, &ctx->flags); /* * We fatal error on write before. Try to writeback * every page again. */ if (ctx->error < 0) invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; put_nfs_open_context_sync(ctx); } } /* * These allocate and release file read/write context information. */ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); put_nfs_open_context(ctx); nfs_fscache_open_file(inode, filp); return 0; } EXPORT_SYMBOL_GPL(nfs_open); /* * This function is called whenever some part of NFS notices that * the cached attributes have to be refreshed. */ int __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { int status = -ESTALE; struct nfs4_label *label = NULL; struct nfs_fattr *fattr = NULL; struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: revalidating (%s/%Lu)\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); trace_nfs_revalidate_inode_enter(inode); if (is_bad_inode(inode)) goto out; if (NFS_STALE(inode)) goto out; /* pNFS: Attributes aren't updated until we layoutcommit */ if (S_ISREG(inode->i_mode)) { status = pnfs_sync_inode(inode, false); if (status) goto out; } status = -ENOMEM; fattr = nfs_alloc_fattr(); if (fattr == NULL) goto out; nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); if (IS_ERR(label)) { status = PTR_ERR(label); goto out; } status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), fattr, label, inode); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) getattr failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); switch (status) { case -ETIMEDOUT: /* A soft timeout occurred. Use cached information? */ if (server->flags & NFS_MOUNT_SOFTREVAL) status = 0; break; case -ESTALE: if (!S_ISDIR(inode->i_mode)) nfs_set_inode_stale(inode); else nfs_zap_caches(inode); } goto err_out; } status = nfs_refresh_inode(inode, fattr); if (status) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Lu) refresh failed, error=%d\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode), status); goto err_out; } if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); nfs_setsecurity(inode, fattr, label); dfprintk(PAGECACHE, "NFS: (%s/%Lu) revalidation complete\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); err_out: nfs4_label_free(label); out: nfs_free_fattr(fattr); trace_nfs_revalidate_inode_exit(inode, status); return status; } int nfs_attribute_cache_expired(struct inode *inode) { if (nfs_have_delegated_attributes(inode)) return 0; return nfs_attribute_timeout(inode); } /** * nfs_revalidate_inode - Revalidate the inode attributes * @server: pointer to nfs_server struct * @inode: pointer to inode struct * * Updates inode attribute information by retrieving the data from the server. */ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) { if (!nfs_need_revalidate_inode(inode)) return NFS_STALE(inode) ? -ESTALE : 0; return __nfs_revalidate_inode(server, inode); } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); int ret; if (mapping->nrpages != 0) { if (S_ISREG(inode->i_mode)) { ret = nfs_sync_mapping(mapping); if (ret < 0) return ret; } ret = invalidate_inode_pages2(mapping); if (ret < 0) return ret; } if (S_ISDIR(inode->i_mode)) { spin_lock(&inode->i_lock); memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf)); spin_unlock(&inode->i_lock); } nfs_inc_stats(inode, NFSIOS_DATAINVALIDATE); nfs_fscache_wait_on_invalidate(inode); dfprintk(PAGECACHE, "NFS: (%s/%Lu) data cache invalidated\n", inode->i_sb->s_id, (unsigned long long)NFS_FILEID(inode)); return 0; } bool nfs_mapping_need_revalidate_inode(struct inode *inode) { return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || NFS_STALE(inode); } int nfs_revalidate_mapping_rcu(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; if (IS_SWAPFILE(inode)) goto out; if (nfs_mapping_need_revalidate_inode(inode)) { ret = -ECHILD; goto out; } spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock) || (nfsi->cache_validity & NFS_INO_INVALID_DATA)) ret = -ECHILD; spin_unlock(&inode->i_lock); out: return ret; } /** * nfs_revalidate_mapping - Revalidate the pagecache * @inode: pointer to host inode * @mapping: pointer to mapping */ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); unsigned long *bitlock = &nfsi->flags; int ret = 0; /* swapfiles are not supposed to be shared. */ if (IS_SWAPFILE(inode)) goto out; if (nfs_mapping_need_revalidate_inode(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) goto out; } /* * We must clear NFS_INO_INVALID_DATA first to ensure that * invalidations that come in while we're shooting down the mappings * are respected. But, that leaves a race window where one revalidator * can clear the flag, and then another checks it before the mapping * gets invalidated. Fix that by serializing access to this part of * the function. * * At the same time, we need to allow other tasks to see whether we * might be in the middle of invalidating the pages, so we only set * the bit lock here if it looks like we're going to be doing that. */ for (;;) { ret = wait_on_bit_action(bitlock, NFS_INO_INVALIDATING, nfs_wait_bit_killable, TASK_KILLABLE); if (ret) goto out; spin_lock(&inode->i_lock); if (test_bit(NFS_INO_INVALIDATING, bitlock)) { spin_unlock(&inode->i_lock); continue; } if (nfsi->cache_validity & NFS_INO_INVALID_DATA) break; spin_unlock(&inode->i_lock); goto out; } set_bit(NFS_INO_INVALIDATING, bitlock); smp_wmb(); nfsi->cache_validity &= ~(NFS_INO_INVALID_DATA| NFS_INO_DATA_INVAL_DEFER); spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); clear_bit_unlock(NFS_INO_INVALIDATING, bitlock); smp_mb__after_atomic(); wake_up_bit(bitlock, NFS_INO_INVALIDATING); out: return ret; } static bool nfs_file_has_writers(struct nfs_inode *nfsi) { struct inode *inode = &nfsi->vfs_inode; if (!S_ISREG(inode->i_mode)) return false; if (list_empty(&nfsi->open_files)) return false; return inode_is_open_for_write(inode); } static bool nfs_file_has_buffered_writers(struct nfs_inode *nfsi) { return nfs_file_has_writers(nfsi) && nfs_file_io_is_buffered(nfsi); } static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct timespec64 ts; if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE) && (fattr->valid & NFS_ATTR_FATTR_CHANGE) && inode_eq_iversion_raw(inode, fattr->pre_change_attr)) { inode_set_iversion_raw(inode, fattr->change_attr); if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); else if (nfs_server_capable(inode, NFS_CAP_XATTR)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); } /* If we have atomic WCC data, we may update some attributes */ ts = inode->i_ctime; if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME) && (fattr->valid & NFS_ATTR_FATTR_CTIME) && timespec64_equal(&ts, &fattr->pre_ctime)) { inode->i_ctime = fattr->ctime; } ts = inode->i_mtime; if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME) && (fattr->valid & NFS_ATTR_FATTR_MTIME) && timespec64_equal(&ts, &fattr->pre_mtime)) { inode->i_mtime = fattr->mtime; if (S_ISDIR(inode->i_mode)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_DATA); } if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE) && (fattr->valid & NFS_ATTR_FATTR_SIZE) && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) && !nfs_have_writebacks(inode)) { i_size_write(inode, nfs_size_to_loff_t(fattr->size)); } } /** * nfs_check_inode_attributes - verify consistency of the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * Verifies the attribute cache. If we have just changed the attributes, * so that fattr carries weak cache consistency data, then it may * also update the ctime/mtime/change_attribute. */ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_size, new_isize; unsigned long invalid = 0; struct timespec64 ts; if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) return 0; if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) return 0; return -ESTALE; } if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) return -ESTALE; if (!nfs_file_has_buffered_writers(nfsi)) { /* Verify a few of the more important attributes */ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && !inode_eq_iversion_raw(inode, fattr->change_attr)) invalid |= NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE; ts = inode->i_mtime; if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec64_equal(&ts, &fattr->mtime)) invalid |= NFS_INO_INVALID_MTIME; ts = inode->i_ctime; if ((fattr->valid & NFS_ATTR_FATTR_CTIME) && !timespec64_equal(&ts, &fattr->ctime)) invalid |= NFS_INO_INVALID_CTIME; if (fattr->valid & NFS_ATTR_FATTR_SIZE) { cur_size = i_size_read(inode); new_isize = nfs_size_to_loff_t(fattr->size); if (cur_size != new_isize) invalid |= NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE; } } /* Have any file permissions changed? */ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && !uid_eq(inode->i_uid, fattr->uid)) invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_OTHER; if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && !gid_eq(inode->i_gid, fattr->gid)) invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_OTHER; /* Has the link count changed? */ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink) invalid |= NFS_INO_INVALID_OTHER; ts = inode->i_atime; if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec64_equal(&ts, &fattr->atime)) invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) nfs_set_cache_invalid(inode, invalid); nfsi->read_cache_jiffies = fattr->time_start; return 0; } static atomic_long_t nfs_attr_generation_counter; static unsigned long nfs_read_attr_generation_counter(void) { return atomic_long_read(&nfs_attr_generation_counter); } unsigned long nfs_inc_attr_generation_counter(void) { return atomic_long_inc_return(&nfs_attr_generation_counter); } EXPORT_SYMBOL_GPL(nfs_inc_attr_generation_counter); void nfs_fattr_init(struct nfs_fattr *fattr) { fattr->valid = 0; fattr->time_start = jiffies; fattr->gencount = nfs_inc_attr_generation_counter(); fattr->owner_name = NULL; fattr->group_name = NULL; } EXPORT_SYMBOL_GPL(nfs_fattr_init); /** * nfs_fattr_set_barrier * @fattr: attributes * * Used to set a barrier after an attribute was updated. This * barrier ensures that older attributes from RPC calls that may * have raced with our update cannot clobber these new values. * Note that you are still responsible for ensuring that other * operations which change the attribute on the server do not * collide. */ void nfs_fattr_set_barrier(struct nfs_fattr *fattr) { fattr->gencount = nfs_inc_attr_generation_counter(); } struct nfs_fattr *nfs_alloc_fattr(void) { struct nfs_fattr *fattr; fattr = kmalloc(sizeof(*fattr), GFP_NOFS); if (fattr != NULL) nfs_fattr_init(fattr); return fattr; } EXPORT_SYMBOL_GPL(nfs_alloc_fattr); struct nfs_fh *nfs_alloc_fhandle(void) { struct nfs_fh *fh; fh = kmalloc(sizeof(struct nfs_fh), GFP_NOFS); if (fh != NULL) fh->size = 0; return fh; } EXPORT_SYMBOL_GPL(nfs_alloc_fhandle); #ifdef NFS_DEBUG /* * _nfs_display_fhandle_hash - calculate the crc32 hash for the filehandle * in the same way that wireshark does * * @fh: file handle * * For debugging only. */ u32 _nfs_display_fhandle_hash(const struct nfs_fh *fh) { /* wireshark uses 32-bit AUTODIN crc and does a bitwise * not on the result */ return nfs_fhandle_hash(fh); } EXPORT_SYMBOL_GPL(_nfs_display_fhandle_hash); /* * _nfs_display_fhandle - display an NFS file handle on the console * * @fh: file handle to display * @caption: display caption * * For debugging only. */ void _nfs_display_fhandle(const struct nfs_fh *fh, const char *caption) { unsigned short i; if (fh == NULL || fh->size == 0) { printk(KERN_DEFAULT "%s at %p is empty\n", caption, fh); return; } printk(KERN_DEFAULT "%s at %p is %u bytes, crc: 0x%08x:\n", caption, fh, fh->size, _nfs_display_fhandle_hash(fh)); for (i = 0; i < fh->size; i += 16) { __be32 *pos = (__be32 *)&fh->data[i]; switch ((fh->size - i - 1) >> 2) { case 0: printk(KERN_DEFAULT " %08x\n", be32_to_cpup(pos)); break; case 1: printk(KERN_DEFAULT " %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1)); break; case 2: printk(KERN_DEFAULT " %08x %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1), be32_to_cpup(pos + 2)); break; default: printk(KERN_DEFAULT " %08x %08x %08x %08x\n", be32_to_cpup(pos), be32_to_cpup(pos + 1), be32_to_cpup(pos + 2), be32_to_cpup(pos + 3)); } } } EXPORT_SYMBOL_GPL(_nfs_display_fhandle); #endif /** * nfs_inode_attrs_need_update - check if the inode attributes need updating * @inode: pointer to inode * @fattr: attributes * * Attempt to divine whether or not an RPC call reply carrying stale * attributes got scheduled after another call carrying updated ones. * * To do so, the function first assumes that a more recent ctime means * that the attributes in fattr are newer, however it also attempt to * catch the case where ctime either didn't change, or went backwards * (if someone reset the clock on the server) by looking at whether * or not this RPC call was started after the inode was last updated. * Note also the check for wraparound of 'attr_gencount' * * The function returns 'true' if it thinks the attributes in 'fattr' are * more recent than the ones cached in the inode. * */ static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) { unsigned long attr_gencount = NFS_I(inode)->attr_gencount; return (long)(fattr->gencount - attr_gencount) > 0 || (long)(attr_gencount - nfs_read_attr_generation_counter()) > 0; } static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) { int ret; trace_nfs_refresh_inode_enter(inode); if (nfs_inode_attrs_need_update(inode, fattr)) ret = nfs_update_inode(inode, fattr); else ret = nfs_check_inode_attributes(inode, fattr); trace_nfs_refresh_inode_exit(inode, ret); return ret; } /** * nfs_refresh_inode - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * Check that an RPC call that returned attributes has not overlapped with * other recent updates of the inode metadata, then decide whether it is * safe to do a full update of the inode attributes, or whether just to * call nfs_check_inode_attributes. */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_refresh_inode); static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr, unsigned int invalid) { if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; return nfs_refresh_inode_locked(inode, fattr); } /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. * * NB: if the server didn't return any post op attributes, this * function will force the retrieval of attributes before the next * NFS request. Thus it should be used only for operations that * are expected to change one or more attributes, to avoid * unnecessary NFS requests and trips through nfs_update_inode(). */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { int status; spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_locked(inode, fattr, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode); /** * nfs_post_op_update_inode_force_wcc_locked - update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up * weak cache consistency data, if none exist. * * This function is mainly designed to be used by the ->write_done() functions. */ int nfs_post_op_update_inode_force_wcc_locked(struct inode *inode, struct nfs_fattr *fattr) { int status; /* Don't do a WCC update if these attributes are already stale */ if ((fattr->valid & NFS_ATTR_FATTR) == 0 || !nfs_inode_attrs_need_update(inode, fattr)) { fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE | NFS_ATTR_FATTR_PRESIZE | NFS_ATTR_FATTR_PREMTIME | NFS_ATTR_FATTR_PRECTIME); goto out_noforce; } if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) { fattr->pre_change_attr = inode_peek_iversion_raw(inode); fattr->valid |= NFS_ATTR_FATTR_PRECHANGE; } if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) { fattr->pre_ctime = inode->i_ctime; fattr->valid |= NFS_ATTR_FATTR_PRECTIME; } if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 && (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) { fattr->pre_mtime = inode->i_mtime; fattr->valid |= NFS_ATTR_FATTR_PREMTIME; } if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 && (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) { fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_FATTR_PRESIZE; } out_noforce: status = nfs_post_op_update_inode_locked(inode, fattr, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_BLOCKS); return status; } /** * nfs_post_op_update_inode_force_wcc - try to update the inode attribute cache * @inode: pointer to inode * @fattr: updated attributes * * After an operation that has changed the inode metadata, mark the * attribute cache as being invalid, then try to update it. Fake up * weak cache consistency data, if none exist. * * This function is mainly designed to be used by the ->write_done() functions. */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { int status; spin_lock(&inode->i_lock); nfs_fattr_set_barrier(fattr); status = nfs_post_op_update_inode_force_wcc_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } EXPORT_SYMBOL_GPL(nfs_post_op_update_inode_force_wcc); /* * Many nfs protocol calls return the new file attributes after * an operation. Here we update the inode to reflect the state * of the server's inode. * * This is a bit tricky because we have to make sure all dirty pages * have been sent off to the server before calling invalidate_inode_pages. * To make sure no other process adds more write requests while we try * our best to flush them, we make them sleep during the attribute refresh. * * A very similar scenario holds for the dir cache. */ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) { struct nfs_server *server; struct nfs_inode *nfsi = NFS_I(inode); loff_t cur_isize, new_isize; unsigned long invalid = 0; unsigned long now = jiffies; unsigned long save_cache_validity; bool have_writers = nfs_file_has_buffered_writers(nfsi); bool cache_revalidated = true; bool attr_changed = false; bool have_delegation; dfprintk(VFS, "NFS: %s(%s/%lu fh_crc=0x%08x ct=%d info=0x%x)\n", __func__, inode->i_sb->s_id, inode->i_ino, nfs_display_fhandle_hash(NFS_FH(inode)), atomic_read(&inode->i_count), fattr->valid); if (!(fattr->valid & NFS_ATTR_FATTR_FILEID)) { /* Only a mounted-on-fileid? Just exit */ if (fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) return 0; /* Has the inode gone and changed behind our back? */ } else if (nfsi->fileid != fattr->fileid) { /* Is this perhaps the mounted-on fileid? */ if ((fattr->valid & NFS_ATTR_FATTR_MOUNTED_ON_FILEID) && nfsi->fileid == fattr->mounted_on_fileid) return 0; printk(KERN_ERR "NFS: server %s error: fileid changed\n" "fsid %s: expected fileid 0x%Lx, got 0x%Lx\n", NFS_SERVER(inode)->nfs_client->cl_hostname, inode->i_sb->s_id, (long long)nfsi->fileid, (long long)fattr->fileid); goto out_err; } /* * Make sure the inode's type hasn't changed. */ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && inode_wrong_type(inode, fattr->mode)) { /* * Big trouble! The inode has become a different object. */ printk(KERN_DEBUG "NFS: %s: inode %lu mode changed, %07o to %07o\n", __func__, inode->i_ino, inode->i_mode, fattr->mode); goto out_err; } server = NFS_SERVER(inode); /* Update the fsid? */ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) && !nfs_fsid_equal(&server->fsid, &fattr->fsid) && !IS_AUTOMOUNT(inode)) server->fsid = fattr->fsid; /* Save the delegation state before clearing cache_validity */ have_delegation = nfs_have_delegated_attributes(inode); /* * Update the read time so we don't revalidate too often. */ nfsi->read_cache_jiffies = fattr->time_start; save_cache_validity = nfsi->cache_validity; nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED | NFS_INO_REVAL_PAGECACHE | NFS_INO_INVALID_BLOCKS); /* Do atomic weak cache consistency updates */ nfs_wcc_update_inode(inode, fattr); if (pnfs_layoutcommit_outstanding(inode)) { nfsi->cache_validity |= save_cache_validity & NFS_INO_INVALID_ATTR; cache_revalidated = false; } /* More cache consistency checks */ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) { if (!inode_eq_iversion_raw(inode, fattr->change_attr)) { /* Could it be a race with writeback? */ if (!(have_writers || have_delegation)) { invalid |= NFS_INO_INVALID_DATA | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | NFS_INO_INVALID_XATTR; /* Force revalidate of all attributes */ save_cache_validity |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | NFS_INO_INVALID_OTHER; if (S_ISDIR(inode->i_mode)) nfs_force_lookup_revalidate(inode); dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); } else if (!have_delegation) nfsi->cache_validity |= NFS_INO_DATA_INVAL_DEFER; inode_set_iversion_raw(inode, fattr->change_attr); attr_changed = true; } } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CHANGE | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_MTIME) { inode->i_mtime = fattr->mtime; } else if (server->caps & NFS_CAP_MTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_MTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_CTIME) { inode->i_ctime = fattr->ctime; } else if (server->caps & NFS_CAP_CTIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_CTIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } /* Check if our cached file size is stale */ if (fattr->valid & NFS_ATTR_FATTR_SIZE) { new_isize = nfs_size_to_loff_t(fattr->size); cur_isize = i_size_read(inode); if (new_isize != cur_isize && !have_delegation) { /* Do we perhaps have any outstanding writes, or has * the file grown beyond our last write? */ if (!nfs_have_writebacks(inode) || new_isize > cur_isize) { i_size_write(inode, new_isize); if (!have_writers) invalid |= NFS_INO_INVALID_DATA; attr_changed = true; } dprintk("NFS: isize change on server for file %s/%ld " "(%Ld to %Ld)\n", inode->i_sb->s_id, inode->i_ino, (long long)cur_isize, (long long)new_isize); } } else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_SIZE | NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; else if (server->caps & NFS_CAP_ATIME) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_ATIME | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { umode_t newmode = inode->i_mode & S_IFMT; newmode |= fattr->mode & S_IALLUGO; inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; attr_changed = true; } } else if (server->caps & NFS_CAP_MODE) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_OWNER) { if (!uid_eq(inode->i_uid, fattr->uid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_uid = fattr->uid; attr_changed = true; } } else if (server->caps & NFS_CAP_OWNER) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_GROUP) { if (!gid_eq(inode->i_gid, fattr->gid)) { invalid |= NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL; inode->i_gid = fattr->gid; attr_changed = true; } } else if (server->caps & NFS_CAP_OWNER_GROUP) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_NLINK) { if (inode->i_nlink != fattr->nlink) { if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; set_nlink(inode, fattr->nlink); attr_changed = true; } } else if (server->caps & NFS_CAP_NLINK) { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); cache_revalidated = false; } if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); } else if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; else { nfsi->cache_validity |= save_cache_validity & (NFS_INO_INVALID_BLOCKS | NFS_INO_REVAL_FORCED); cache_revalidated = false; } /* Update attrtimeo value if we're out of the unstable period */ if (attr_changed) { invalid &= ~NFS_INO_INVALID_ATTR; nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; /* Set barrier to be more recent than all outstanding updates */ nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (cache_revalidated) { if (!time_in_range_open(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { nfsi->attrtimeo <<= 1; if (nfsi->attrtimeo > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); } nfsi->attrtimeo_timestamp = now; } /* Set the barrier to be more recent than this fattr */ if ((long)(fattr->gencount - nfsi->attr_gencount) > 0) nfsi->attr_gencount = fattr->gencount; } /* Don't invalidate the data if we were to blame */ if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) invalid &= ~NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); return 0; out_err: /* * No need to worry about unhashing the dentry, as the * lookup validation will know that the inode is bad. * (But we fall through to invalidate the caches.) */ nfs_set_inode_stale_locked(inode); return -ESTALE; } struct inode *nfs_alloc_inode(struct super_block *sb) { struct nfs_inode *nfsi; nfsi = kmem_cache_alloc(nfs_inode_cachep, GFP_KERNEL); if (!nfsi) return NULL; nfsi->flags = 0UL; nfsi->cache_validity = 0UL; #if IS_ENABLED(CONFIG_NFS_V4) nfsi->nfs4_acl = NULL; #endif /* CONFIG_NFS_V4 */ #ifdef CONFIG_NFS_V4_2 nfsi->xattr_cache = NULL; #endif return &nfsi->vfs_inode; } EXPORT_SYMBOL_GPL(nfs_alloc_inode); void nfs_free_inode(struct inode *inode) { kmem_cache_free(nfs_inode_cachep, NFS_I(inode)); } EXPORT_SYMBOL_GPL(nfs_free_inode); static inline void nfs4_init_once(struct nfs_inode *nfsi) { #if IS_ENABLED(CONFIG_NFS_V4) INIT_LIST_HEAD(&nfsi->open_states); nfsi->delegation = NULL; init_rwsem(&nfsi->rwsem); nfsi->layout = NULL; #endif } static void init_once(void *foo) { struct nfs_inode *nfsi = (struct nfs_inode *) foo; inode_init_once(&nfsi->vfs_inode); INIT_LIST_HEAD(&nfsi->open_files); INIT_LIST_HEAD(&nfsi->access_cache_entry_lru); INIT_LIST_HEAD(&nfsi->access_cache_inode_lru); INIT_LIST_HEAD(&nfsi->commit_info.list); atomic_long_set(&nfsi->nrequests, 0); atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); init_rwsem(&nfsi->rmdir_sem); mutex_init(&nfsi->commit_mutex); nfs4_init_once(nfsi); nfsi->cache_change_attribute = 0; } static int __init nfs_init_inodecache(void) { nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; return 0; } static void nfs_destroy_inodecache(void) { /* * Make sure all delayed rcu free inodes are flushed before we * destroy cache. */ rcu_barrier(); kmem_cache_destroy(nfs_inode_cachep); } struct workqueue_struct *nfsiod_workqueue; EXPORT_SYMBOL_GPL(nfsiod_workqueue); /* * start up the nfsiod workqueue */ static int nfsiod_start(void) { struct workqueue_struct *wq; dprintk("RPC: creating workqueue nfsiod\n"); wq = alloc_workqueue("nfsiod", WQ_MEM_RECLAIM | WQ_UNBOUND, 0); if (wq == NULL) return -ENOMEM; nfsiod_workqueue = wq; return 0; } /* * Destroy the nfsiod workqueue */ static void nfsiod_stop(void) { struct workqueue_struct *wq; wq = nfsiod_workqueue; if (wq == NULL) return; nfsiod_workqueue = NULL; destroy_workqueue(wq); } unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) { nfs_clients_init(net); return nfs_fs_proc_net_init(net); } static void nfs_net_exit(struct net *net) { nfs_fs_proc_net_exit(net); nfs_clients_exit(net); } static struct pernet_operations nfs_net_ops = { .init = nfs_net_init, .exit = nfs_net_exit, .id = &nfs_net_id, .size = sizeof(struct nfs_net), }; /* * Initialize NFS */ static int __init init_nfs_fs(void) { int err; err = nfs_sysfs_init(); if (err < 0) goto out10; err = register_pernet_subsys(&nfs_net_ops); if (err < 0) goto out9; err = nfs_fscache_register(); if (err < 0) goto out8; err = nfsiod_start(); if (err) goto out7; err = nfs_fs_proc_init(); if (err) goto out6; err = nfs_init_nfspagecache(); if (err) goto out5; err = nfs_init_inodecache(); if (err) goto out4; err = nfs_init_readpagecache(); if (err) goto out3; err = nfs_init_writepagecache(); if (err) goto out2; err = nfs_init_directcache(); if (err) goto out1; rpc_proc_register(&init_net, &nfs_rpcstat); err = register_nfs_fs(); if (err) goto out0; return 0; out0: rpc_proc_unregister(&init_net, "nfs"); nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); out2: nfs_destroy_readpagecache(); out3: nfs_destroy_inodecache(); out4: nfs_destroy_nfspagecache(); out5: nfs_fs_proc_exit(); out6: nfsiod_stop(); out7: nfs_fscache_unregister(); out8: unregister_pernet_subsys(&nfs_net_ops); out9: nfs_sysfs_exit(); out10: return err; } static void __exit exit_nfs_fs(void) { nfs_destroy_directcache(); nfs_destroy_writepagecache(); nfs_destroy_readpagecache(); nfs_destroy_inodecache(); nfs_destroy_nfspagecache(); nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); rpc_proc_unregister(&init_net, "nfs"); unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); nfs_sysfs_exit(); } /* Not quite true; I just maintain it */ MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); MODULE_LICENSE("GPL"); module_param(enable_ino64, bool, 0644); module_init(init_nfs_fs) module_exit(exit_nfs_fs)
12 12 12 12 12 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 /* SPDX-License-Identifier: GPL-2.0+ */ /* * Read-Copy Update mechanism for mutual exclusion * * Copyright IBM Corporation, 2001 * * Author: Dipankar Sarma <dipankar@in.ibm.com> * * Based on the original work by Paul McKenney <paulmck@vnet.ibm.com> * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. * Papers: * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) * * For detailed explanation of Read-Copy Update mechanism see - * http://lse.sourceforge.net/locking/rcupdate.html * */ #ifndef __LINUX_RCUPDATE_H #define __LINUX_RCUPDATE_H #include <linux/types.h> #include <linux/compiler.h> #include <linux/atomic.h> #include <linux/irqflags.h> #include <linux/preempt.h> #include <linux/bottom_half.h> #include <linux/lockdep.h> #include <asm/processor.h> #include <linux/cpumask.h> #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) #define ulong2long(a) (*(long *)(&(a))) #define USHORT_CMP_GE(a, b) (USHRT_MAX / 2 >= (unsigned short)((a) - (b))) #define USHORT_CMP_LT(a, b) (USHRT_MAX / 2 < (unsigned short)((a) - (b))) /* Exported common interfaces */ void call_rcu(struct rcu_head *head, rcu_callback_t func); void rcu_barrier_tasks(void); void rcu_barrier_tasks_rude(void); void synchronize_rcu(void); #ifdef CONFIG_PREEMPT_RCU void __rcu_read_lock(void); void __rcu_read_unlock(void); /* * Defined as a macro as it is a very low level header included from * areas that don't even know about current. This gives the rcu_read_lock() * nesting depth, but makes sense only if CONFIG_PREEMPT_RCU -- in other * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) #else /* #ifdef CONFIG_PREEMPT_RCU */ #ifdef CONFIG_TINY_RCU #define rcu_read_unlock_strict() do { } while (0) #else void rcu_read_unlock_strict(void); #endif static inline void __rcu_read_lock(void) { preempt_disable(); } static inline void __rcu_read_unlock(void) { preempt_enable(); rcu_read_unlock_strict(); } static inline int rcu_preempt_depth(void) { return 0; } #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ void rcu_init(void); extern int rcu_scheduler_active __read_mostly; void rcu_sched_clock_irq(int user); void rcu_report_dead(unsigned int cpu); void rcutree_migrate_callbacks(int cpu); #ifdef CONFIG_TASKS_RCU_GENERIC void rcu_init_tasks_generic(void); #else static inline void rcu_init_tasks_generic(void) { } #endif #ifdef CONFIG_RCU_STALL_COMMON void rcu_sysrq_start(void); void rcu_sysrq_end(void); #else /* #ifdef CONFIG_RCU_STALL_COMMON */ static inline void rcu_sysrq_start(void) { } static inline void rcu_sysrq_end(void) { } #endif /* #else #ifdef CONFIG_RCU_STALL_COMMON */ #ifdef CONFIG_NO_HZ_FULL void rcu_user_enter(void); void rcu_user_exit(void); #else static inline void rcu_user_enter(void) { } static inline void rcu_user_exit(void) { } #endif /* CONFIG_NO_HZ_FULL */ #ifdef CONFIG_RCU_NOCB_CPU void rcu_init_nohz(void); void rcu_nocb_flush_deferred_wakeup(void); #else /* #ifdef CONFIG_RCU_NOCB_CPU */ static inline void rcu_init_nohz(void) { } static inline void rcu_nocb_flush_deferred_wakeup(void) { } #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ /** * RCU_NONIDLE - Indicate idle-loop code that needs RCU readers * @a: Code that RCU needs to pay attention to. * * RCU read-side critical sections are forbidden in the inner idle loop, * that is, between the rcu_idle_enter() and the rcu_idle_exit() -- RCU * will happily ignore any such read-side critical sections. However, * things like powertop need tracepoints in the inner idle loop. * * This macro provides the way out: RCU_NONIDLE(do_something_with_RCU()) * will tell RCU that it needs to pay attention, invoke its argument * (in this example, calling the do_something_with_RCU() function), * and then tell RCU to go back to ignoring this CPU. It is permissible * to nest RCU_NONIDLE() wrappers, but not indefinitely (but the limit is * on the order of a million or so, even on 32-bit systems). It is * not legal to block within RCU_NONIDLE(), nor is it permissible to * transfer control either into or out of RCU_NONIDLE()'s statement. */ #define RCU_NONIDLE(a) \ do { \ rcu_irq_enter_irqson(); \ do { a; } while (0); \ rcu_irq_exit_irqson(); \ } while (0) /* * Note a quasi-voluntary context switch for RCU-tasks's benefit. * This is a macro rather than an inline function to avoid #include hell. */ #ifdef CONFIG_TASKS_RCU_GENERIC # ifdef CONFIG_TASKS_RCU # define rcu_tasks_classic_qs(t, preempt) \ do { \ if (!(preempt) && READ_ONCE((t)->rcu_tasks_holdout)) \ WRITE_ONCE((t)->rcu_tasks_holdout, false); \ } while (0) void call_rcu_tasks(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks(void); # else # define rcu_tasks_classic_qs(t, preempt) do { } while (0) # define call_rcu_tasks call_rcu # define synchronize_rcu_tasks synchronize_rcu # endif # ifdef CONFIG_TASKS_TRACE_RCU # define rcu_tasks_trace_qs(t) \ do { \ if (!likely(READ_ONCE((t)->trc_reader_checked)) && \ !unlikely(READ_ONCE((t)->trc_reader_nesting))) { \ smp_store_release(&(t)->trc_reader_checked, true); \ smp_mb(); /* Readers partitioned by store. */ \ } \ } while (0) # else # define rcu_tasks_trace_qs(t) do { } while (0) # endif #define rcu_tasks_qs(t, preempt) \ do { \ rcu_tasks_classic_qs((t), (preempt)); \ rcu_tasks_trace_qs((t)); \ } while (0) # ifdef CONFIG_TASKS_RUDE_RCU void call_rcu_tasks_rude(struct rcu_head *head, rcu_callback_t func); void synchronize_rcu_tasks_rude(void); # endif #define rcu_note_voluntary_context_switch(t) rcu_tasks_qs(t, false) void exit_tasks_rcu_start(void); void exit_tasks_rcu_finish(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ #define rcu_tasks_qs(t, preempt) do { } while (0) #define rcu_note_voluntary_context_switch(t) do { } while (0) #define call_rcu_tasks call_rcu #define synchronize_rcu_tasks synchronize_rcu static inline void exit_tasks_rcu_start(void) { } static inline void exit_tasks_rcu_finish(void) { } #endif /* #else #ifdef CONFIG_TASKS_RCU_GENERIC */ /** * cond_resched_tasks_rcu_qs - Report potential quiescent states to RCU * * This macro resembles cond_resched(), except that it is defined to * report potential quiescent states to RCU-tasks even if the cond_resched() * machinery were to be shut off, as some advocate for PREEMPTION kernels. */ #define cond_resched_tasks_rcu_qs() \ do { \ rcu_tasks_qs(current, false); \ cond_resched(); \ } while (0) /* * Infrastructure to implement the synchronize_() primitives in * TREE_RCU and rcu_barrier_() primitives in TINY_RCU. */ #if defined(CONFIG_TREE_RCU) #include <linux/rcutree.h> #elif defined(CONFIG_TINY_RCU) #include <linux/rcutiny.h> #else #error "Unknown RCU implementation specified to kernel configuration" #endif /* * The init_rcu_head_on_stack() and destroy_rcu_head_on_stack() calls * are needed for dynamic initialization and destruction of rcu_head * on the stack, and init_rcu_head()/destroy_rcu_head() are needed for * dynamic initialization and destruction of statically allocated rcu_head * structures. However, rcu_head structures allocated dynamically in the * heap don't need any initialization. */ #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD void init_rcu_head(struct rcu_head *head); void destroy_rcu_head(struct rcu_head *head); void init_rcu_head_on_stack(struct rcu_head *head); void destroy_rcu_head_on_stack(struct rcu_head *head); #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ static inline void init_rcu_head(struct rcu_head *head) { } static inline void destroy_rcu_head(struct rcu_head *head) { } static inline void init_rcu_head_on_stack(struct rcu_head *head) { } static inline void destroy_rcu_head_on_stack(struct rcu_head *head) { } #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) bool rcu_lockdep_current_cpu_online(void); #else /* #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ static inline bool rcu_lockdep_current_cpu_online(void) { return true; } #endif /* #else #if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PROVE_RCU) */ #ifdef CONFIG_DEBUG_LOCK_ALLOC static inline void rcu_lock_acquire(struct lockdep_map *map) { lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_); } static inline void rcu_lock_release(struct lockdep_map *map) { lock_release(map, _THIS_IP_); } extern struct lockdep_map rcu_lock_map; extern struct lockdep_map rcu_bh_lock_map; extern struct lockdep_map rcu_sched_lock_map; extern struct lockdep_map rcu_callback_map; int debug_lockdep_rcu_enabled(void); int rcu_read_lock_held(void); int rcu_read_lock_bh_held(void); int rcu_read_lock_sched_held(void); int rcu_read_lock_any_held(void); #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ # define rcu_lock_acquire(a) do { } while (0) # define rcu_lock_release(a) do { } while (0) static inline int rcu_read_lock_held(void) { return 1; } static inline int rcu_read_lock_bh_held(void) { return 1; } static inline int rcu_read_lock_sched_held(void) { return !preemptible(); } static inline int rcu_read_lock_any_held(void) { return !preemptible(); } #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ #ifdef CONFIG_PROVE_RCU /** * RCU_LOCKDEP_WARN - emit lockdep splat if specified condition is met * @c: condition to check * @s: informative message */ #define RCU_LOCKDEP_WARN(c, s) \ do { \ static bool __section(".data.unlikely") __warned; \ if ((c) && debug_lockdep_rcu_enabled() && !__warned) { \ __warned = true; \ lockdep_rcu_suspicious(__FILE__, __LINE__, s); \ } \ } while (0) #if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU) static inline void rcu_preempt_sleep_check(void) { RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map), "Illegal context switch in RCU read-side critical section"); } #else /* #ifdef CONFIG_PROVE_RCU */ static inline void rcu_preempt_sleep_check(void) { } #endif /* #else #ifdef CONFIG_PROVE_RCU */ #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ } while (0) #else /* #ifdef CONFIG_PROVE_RCU */ #define RCU_LOCKDEP_WARN(c, s) do { } while (0) #define rcu_sleep_check() do { } while (0) #endif /* #else #ifdef CONFIG_PROVE_RCU */ /* * Helper functions for rcu_dereference_check(), rcu_dereference_protected() * and rcu_assign_pointer(). Some of these could be folded into their * callers, but they are left separate in order to ease introduction of * multiple pointers markings to match different RCU implementations * (e.g., __srcu), should this make sense in the future. */ #ifdef __CHECKER__ #define rcu_check_sparse(p, space) \ ((void)(((typeof(*p) space *)p) == p)) #else /* #ifdef __CHECKER__ */ #define rcu_check_sparse(p, space) #endif /* #else #ifdef __CHECKER__ */ #define __rcu_access_pointer(p, space) \ ({ \ typeof(*p) *_________p1 = (typeof(*p) *__force)READ_ONCE(p); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(_________p1)); \ }) #define __rcu_dereference_check(p, c, space) \ ({ \ /* Dependency order vs. p above. */ \ typeof(*p) *________p1 = (typeof(*p) *__force)READ_ONCE(p); \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_check() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) #define __rcu_dereference_protected(p, c, space) \ ({ \ RCU_LOCKDEP_WARN(!(c), "suspicious rcu_dereference_protected() usage"); \ rcu_check_sparse(p, space); \ ((typeof(*p) __force __kernel *)(p)); \ }) #define rcu_dereference_raw(p) \ ({ \ /* Dependency order vs. p above. */ \ typeof(p) ________p1 = READ_ONCE(p); \ ((typeof(*p) __force __kernel *)(________p1)); \ }) /** * RCU_INITIALIZER() - statically initialize an RCU-protected global variable * @v: The value to statically initialize with. */ #define RCU_INITIALIZER(v) (typeof(*(v)) __force __rcu *)(v) /** * rcu_assign_pointer() - assign to RCU-protected pointer * @p: pointer to assign to * @v: value to assign (publish) * * Assigns the specified value to the specified RCU-protected * pointer, ensuring that any concurrent RCU readers will see * any prior initialization. * * Inserts memory barriers on architectures that require them * (which is most of them), and also prevents the compiler from * reordering the code that initializes the structure after the pointer * assignment. More importantly, this call documents which pointers * will be dereferenced by RCU read-side code. * * In some special cases, you may use RCU_INIT_POINTER() instead * of rcu_assign_pointer(). RCU_INIT_POINTER() is a bit faster due * to the fact that it does not constrain either the CPU or the compiler. * That said, using RCU_INIT_POINTER() when you should have used * rcu_assign_pointer() is a very bad thing that results in * impossible-to-diagnose memory corruption. So please be careful. * See the RCU_INIT_POINTER() comment header for details. * * Note that rcu_assign_pointer() evaluates each of its arguments only * once, appearances notwithstanding. One of the "extra" evaluations * is in typeof() and the other visible only to sparse (__CHECKER__), * neither of which actually execute the argument. As with most cpp * macros, this execute-arguments-only-once property is important, so * please be careful when making changes to rcu_assign_pointer() and the * other macros that it invokes. */ #define rcu_assign_pointer(p, v) \ do { \ uintptr_t _r_a_p__v = (uintptr_t)(v); \ rcu_check_sparse(p, __rcu); \ \ if (__builtin_constant_p(v) && (_r_a_p__v) == (uintptr_t)NULL) \ WRITE_ONCE((p), (typeof(p))(_r_a_p__v)); \ else \ smp_store_release(&p, RCU_INITIALIZER((typeof(p))_r_a_p__v)); \ } while (0) /** * rcu_replace_pointer() - replace an RCU pointer, returning its old value * @rcu_ptr: RCU pointer, whose old value is returned * @ptr: regular pointer * @c: the lockdep conditions under which the dereference will take place * * Perform a replacement, where @rcu_ptr is an RCU-annotated * pointer and @c is the lockdep argument that is passed to the * rcu_dereference_protected() call used to read that pointer. The old * value of @rcu_ptr is returned, and @rcu_ptr is set to @ptr. */ #define rcu_replace_pointer(rcu_ptr, ptr, c) \ ({ \ typeof(ptr) __tmp = rcu_dereference_protected((rcu_ptr), (c)); \ rcu_assign_pointer((rcu_ptr), (ptr)); \ __tmp; \ }) /** * rcu_access_pointer() - fetch RCU pointer with no dereferencing * @p: The pointer to read * * Return the value of the specified RCU-protected pointer, but omit the * lockdep checks for being in an RCU read-side critical section. This is * useful when the value of this pointer is accessed, but the pointer is * not dereferenced, for example, when testing an RCU-protected pointer * against NULL. Although rcu_access_pointer() may also be used in cases * where update-side locks prevent the value of the pointer from changing, * you should instead use rcu_dereference_protected() for this use case. * * It is also permissible to use rcu_access_pointer() when read-side * access to the pointer was removed at least one grace period ago, as * is the case in the context of the RCU callback that is freeing up * the data, or after a synchronize_rcu() returns. This can be useful * when tearing down multi-linked structures after a grace period * has elapsed. */ #define rcu_access_pointer(p) __rcu_access_pointer((p), __rcu) /** * rcu_dereference_check() - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Do an rcu_dereference(), but check that the conditions under which the * dereference will take place are correct. Typically the conditions * indicate the various locking conditions that should be held at that * point. The check should return true if the conditions are satisfied. * An implicit check for being in an RCU read-side critical section * (rcu_read_lock()) is included. * * For example: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock)); * * could be used to indicate to lockdep that foo->bar may only be dereferenced * if either rcu_read_lock() is held, or that the lock required to replace * the bar struct at foo->bar is held. * * Note that the list of conditions may also include indications of when a lock * need not be held, for example during initialisation or destruction of the * target struct: * * bar = rcu_dereference_check(foo->bar, lockdep_is_held(&foo->lock) || * atomic_read(&foo->usage) == 0); * * Inserts memory barriers on architectures that require them * (currently only the Alpha), prevents the compiler from refetching * (and from merging fetches), and, more importantly, documents exactly * which pointers are protected by RCU and checks that the pointer is * annotated as __rcu. */ #define rcu_dereference_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu) /** * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-bh counterpart to rcu_dereference_check(). */ #define rcu_dereference_bh_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu) /** * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * This is the RCU-sched counterpart to rcu_dereference_check(). */ #define rcu_dereference_sched_check(p, c) \ __rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \ __rcu) /* * The tracing infrastructure traces RCU (we want that), but unfortunately * some of the RCU checks causes tracing to lock up the system. * * The no-tracing version of rcu_dereference_raw() must not call * rcu_read_lock_held(). */ #define rcu_dereference_raw_check(p) __rcu_dereference_check((p), 1, __rcu) /** * rcu_dereference_protected() - fetch RCU pointer when updates prevented * @p: The pointer to read, prior to dereferencing * @c: The conditions under which the dereference will take place * * Return the value of the specified RCU-protected pointer, but omit * the READ_ONCE(). This is useful in cases where update-side locks * prevent the value of the pointer from changing. Please note that this * primitive does *not* prevent the compiler from repeating this reference * or combining it with other references, so it should not be used without * protection of appropriate locks. * * This function is only for update-side use. Using this function * when protected only by rcu_read_lock() will result in infrequent * but very ugly failures. */ #define rcu_dereference_protected(p, c) \ __rcu_dereference_protected((p), (c), __rcu) /** * rcu_dereference() - fetch RCU-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * This is a simple wrapper around rcu_dereference_check(). */ #define rcu_dereference(p) rcu_dereference_check(p, 0) /** * rcu_dereference_bh() - fetch an RCU-bh-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_bh(p) rcu_dereference_bh_check(p, 0) /** * rcu_dereference_sched() - fetch RCU-sched-protected pointer for dereferencing * @p: The pointer to read, prior to dereferencing * * Makes rcu_dereference_check() do the dirty work. */ #define rcu_dereference_sched(p) rcu_dereference_sched_check(p, 0) /** * rcu_pointer_handoff() - Hand off a pointer from RCU to other mechanism * @p: The pointer to hand off * * This is simply an identity function, but it documents where a pointer * is handed off from RCU to some other synchronization mechanism, for * example, reference counting or locking. In C11, it would map to * kill_dependency(). It could be used as follows:: * * rcu_read_lock(); * p = rcu_dereference(gp); * long_lived = is_long_lived(p); * if (long_lived) { * if (!atomic_inc_not_zero(p->refcnt)) * long_lived = false; * else * p = rcu_pointer_handoff(p); * } * rcu_read_unlock(); */ #define rcu_pointer_handoff(p) (p) /** * rcu_read_lock() - mark the beginning of an RCU read-side critical section * * When synchronize_rcu() is invoked on one CPU while other CPUs * are within RCU read-side critical sections, then the * synchronize_rcu() is guaranteed to block until after all the other * CPUs exit their critical sections. Similarly, if call_rcu() is invoked * on one CPU while other CPUs are within RCU read-side critical * sections, invocation of the corresponding RCU callback is deferred * until after the all the other CPUs exit their critical sections. * * Note, however, that RCU callbacks are permitted to run concurrently * with new RCU read-side critical sections. One way that this can happen * is via the following sequence of events: (1) CPU 0 enters an RCU * read-side critical section, (2) CPU 1 invokes call_rcu() to register * an RCU callback, (3) CPU 0 exits the RCU read-side critical section, * (4) CPU 2 enters a RCU read-side critical section, (5) the RCU * callback is invoked. This is legal, because the RCU read-side critical * section that was running concurrently with the call_rcu() (and which * therefore might be referencing something that the corresponding RCU * callback would free up) has completed before the corresponding * RCU callback is invoked. * * RCU read-side critical sections may be nested. Any deferred actions * will be deferred until the outermost RCU read-side critical section * completes. * * You can avoid reading and understanding the next paragraph by * following this rule: don't put anything in an rcu_read_lock() RCU * read-side critical section that would block in a !PREEMPTION kernel. * But if you want the full story, read on! * * In non-preemptible RCU implementations (pure TREE_RCU and TINY_RCU), * it is illegal to block while in an RCU read-side critical section. * In preemptible RCU implementations (PREEMPT_RCU) in CONFIG_PREEMPTION * kernel builds, RCU read-side critical sections may be preempted, * but explicit blocking is illegal. Finally, in preemptible RCU * implementations in real-time (with -rt patchset) kernel builds, RCU * read-side critical sections may be preempted and they may also block, but * only when acquiring spinlocks that are subject to priority inheritance. */ static __always_inline void rcu_read_lock(void) { __rcu_read_lock(); __acquire(RCU); rcu_lock_acquire(&rcu_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock() used illegally while idle"); } /* * So where is rcu_write_lock()? It does not exist, as there is no * way for writers to lock out RCU readers. This is a feature, not * a bug -- this property is what provides RCU's performance benefits. * Of course, writers must coordinate with each other. The normal * spinlock primitives work well for this, but any other technique may be * used as well. RCU does not care how the writers keep out of each * others' way, as long as they do so. */ /** * rcu_read_unlock() - marks the end of an RCU read-side critical section. * * In most situations, rcu_read_unlock() is immune from deadlock. * However, in kernels built with CONFIG_RCU_BOOST, rcu_read_unlock() * is responsible for deboosting, which it does via rt_mutex_unlock(). * Unfortunately, this function acquires the scheduler's runqueue and * priority-inheritance spinlocks. This means that deadlock could result * if the caller of rcu_read_unlock() already holds one of these locks or * any lock that is ever acquired while holding them. * * That said, RCU readers are never priority boosted unless they were * preempted. Therefore, one way to avoid deadlock is to make sure * that preemption never happens within any RCU read-side critical * section whose outermost rcu_read_unlock() is called with one of * rt_mutex_unlock()'s locks held. Such preemption can be avoided in * a number of ways, for example, by invoking preempt_disable() before * critical section's outermost rcu_read_lock(). * * Given that the set of locks acquired by rt_mutex_unlock() might change * at any time, a somewhat more future-proofed approach is to make sure * that that preemption never happens within any RCU read-side critical * section whose outermost rcu_read_unlock() is called with irqs disabled. * This approach relies on the fact that rt_mutex_unlock() currently only * acquires irq-disabled locks. * * The second of these two approaches is best in most situations, * however, the first approach can also be useful, at least to those * developers willing to keep abreast of the set of locks acquired by * rt_mutex_unlock(). * * See rcu_read_lock() for more information. */ static inline void rcu_read_unlock(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock() used illegally while idle"); __release(RCU); __rcu_read_unlock(); rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */ } /** * rcu_read_lock_bh() - mark the beginning of an RCU-bh critical section * * This is equivalent of rcu_read_lock(), but also disables softirqs. * Note that anything else that disables softirqs can also serve as * an RCU read-side critical section. * * Note that rcu_read_lock_bh() and the matching rcu_read_unlock_bh() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_bh() from one task if the matching rcu_read_lock_bh() * was invoked from some other task. */ static inline void rcu_read_lock_bh(void) { local_bh_disable(); __acquire(RCU_BH); rcu_lock_acquire(&rcu_bh_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_bh() used illegally while idle"); } /** * rcu_read_unlock_bh() - marks the end of a softirq-only RCU critical section * * See rcu_read_lock_bh() for more information. */ static inline void rcu_read_unlock_bh(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_bh() used illegally while idle"); rcu_lock_release(&rcu_bh_lock_map); __release(RCU_BH); local_bh_enable(); } /** * rcu_read_lock_sched() - mark the beginning of a RCU-sched critical section * * This is equivalent of rcu_read_lock(), but disables preemption. * Read-side critical sections can also be introduced by anything else * that disables preemption, including local_irq_disable() and friends. * * Note that rcu_read_lock_sched() and the matching rcu_read_unlock_sched() * must occur in the same context, for example, it is illegal to invoke * rcu_read_unlock_sched() from process context if the matching * rcu_read_lock_sched() was invoked from an NMI handler. */ static inline void rcu_read_lock_sched(void) { preempt_disable(); __acquire(RCU_SCHED); rcu_lock_acquire(&rcu_sched_lock_map); RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_lock_sched() used illegally while idle"); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_lock_sched_notrace(void) { preempt_disable_notrace(); __acquire(RCU_SCHED); } /** * rcu_read_unlock_sched() - marks the end of a RCU-classic critical section * * See rcu_read_lock_sched() for more information. */ static inline void rcu_read_unlock_sched(void) { RCU_LOCKDEP_WARN(!rcu_is_watching(), "rcu_read_unlock_sched() used illegally while idle"); rcu_lock_release(&rcu_sched_lock_map); __release(RCU_SCHED); preempt_enable(); } /* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */ static inline notrace void rcu_read_unlock_sched_notrace(void) { __release(RCU_SCHED); preempt_enable_notrace(); } /** * RCU_INIT_POINTER() - initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * Initialize an RCU-protected pointer in special cases where readers * do not need ordering constraints on the CPU or the compiler. These * special cases are: * * 1. This use of RCU_INIT_POINTER() is NULLing out the pointer *or* * 2. The caller has taken whatever steps are required to prevent * RCU readers from concurrently accessing this pointer *or* * 3. The referenced data structure has already been exposed to * readers either at compile time or via rcu_assign_pointer() *and* * * a. You have not made *any* reader-visible changes to * this structure since then *or* * b. It is OK for readers accessing this structure from its * new location to see the old state of the structure. (For * example, the changes were to statistical counters or to * other state where exact synchronization is not required.) * * Failure to follow these rules governing use of RCU_INIT_POINTER() will * result in impossible-to-diagnose memory corruption. As in the structures * will look OK in crash dumps, but any concurrent RCU readers might * see pre-initialized values of the referenced data structure. So * please be very careful how you use RCU_INIT_POINTER()!!! * * If you are creating an RCU-protected linked structure that is accessed * by a single external-to-structure RCU-protected pointer, then you may * use RCU_INIT_POINTER() to initialize the internal RCU-protected * pointers, but you must use rcu_assign_pointer() to initialize the * external-to-structure pointer *after* you have completely initialized * the reader-accessible portions of the linked structure. * * Note that unlike rcu_assign_pointer(), RCU_INIT_POINTER() provides no * ordering guarantees for either the CPU or the compiler. */ #define RCU_INIT_POINTER(p, v) \ do { \ rcu_check_sparse(p, __rcu); \ WRITE_ONCE(p, RCU_INITIALIZER(v)); \ } while (0) /** * RCU_POINTER_INITIALIZER() - statically initialize an RCU protected pointer * @p: The pointer to be initialized. * @v: The value to initialized the pointer to. * * GCC-style initialization for an RCU-protected pointer in a structure field. */ #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) /* * Does the specified offset indicate that the corresponding rcu_head * structure can be handled by kvfree_rcu()? */ #define __is_kvfree_rcu_offset(offset) ((offset) < 4096) /* * Helper macro for kfree_rcu() to prevent argument-expansion eyestrain. */ #define __kvfree_rcu(head, offset) \ do { \ BUILD_BUG_ON(!__is_kvfree_rcu_offset(offset)); \ kvfree_call_rcu(head, (rcu_callback_t)(unsigned long)(offset)); \ } while (0) /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree * @rhf: the name of the struct rcu_head within the type of @ptr. * * Many rcu callbacks functions just call kfree() on the base structure. * These functions are trivial, but their size adds up, and furthermore * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * * The kfree_rcu() function handles this issue. Rather than encoding a * function address in the embedded rcu_head structure, kfree_rcu() instead * encodes the offset of the rcu_head structure within the base structure. * Because the functions are not allowed in the low-order 4096 bytes of * kernel virtual memory, offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in __kvfree_rcu(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to * position the rcu_head structure into the first 4096 bytes. * * Note that the allowable offset might decrease in the future, for example, * to allow something like kmem_cache_free_rcu(). * * The BUILD_BUG_ON check must not involve any function calls, hence the * checks are done in macros here. */ #define kfree_rcu(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ if (___p) \ __kvfree_rcu(&((___p)->rhf), offsetof(typeof(*(ptr)), rhf)); \ } while (0) /** * kvfree_rcu() - kvfree an object after a grace period. * * This macro consists of one or two arguments and it is * based on whether an object is head-less or not. If it * has a head then a semantic stays the same as it used * to be before: * * kvfree_rcu(ptr, rhf); * * where @ptr is a pointer to kvfree(), @rhf is the name * of the rcu_head structure within the type of @ptr. * * When it comes to head-less variant, only one argument * is passed and that is just a pointer which has to be * freed after a grace period. Therefore the semantic is * * kvfree_rcu(ptr); * * where @ptr is a pointer to kvfree(). * * Please note, head-less way of freeing is permitted to * use from a context that has to follow might_sleep() * annotation. Otherwise, please switch and embed the * rcu_head structure within the type of @ptr. */ #define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME #define kvfree_rcu_arg_2(ptr, rhf) kfree_rcu(ptr, rhf) #define kvfree_rcu_arg_1(ptr) \ do { \ typeof(ptr) ___p = (ptr); \ \ if (___p) \ kvfree_call_rcu(NULL, (rcu_callback_t) (___p)); \ } while (0) /* * Place this after a lock-acquisition primitive to guarantee that * an UNLOCK+LOCK pair acts as a full barrier. This guarantee applies * if the UNLOCK and LOCK are executed by the same CPU or if the * UNLOCK and LOCK operate on the same lock variable. */ #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE #define smp_mb__after_unlock_lock() smp_mb() /* Full ordering for lock. */ #else /* #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_ARCH_WEAK_RELEASE_ACQUIRE */ /* Has the specified rcu_head structure been handed to call_rcu()? */ /** * rcu_head_init - Initialize rcu_head for rcu_head_after_call_rcu() * @rhp: The rcu_head structure to initialize. * * If you intend to invoke rcu_head_after_call_rcu() to test whether a * given rcu_head structure has already been passed to call_rcu(), then * you must also invoke this rcu_head_init() function on it just after * allocating that structure. Calls to this function must not race with * calls to call_rcu(), rcu_head_after_call_rcu(), or callback invocation. */ static inline void rcu_head_init(struct rcu_head *rhp) { rhp->func = (rcu_callback_t)~0L; } /** * rcu_head_after_call_rcu() - Has this rcu_head been passed to call_rcu()? * @rhp: The rcu_head structure to test. * @f: The function passed to call_rcu() along with @rhp. * * Returns @true if the @rhp has been passed to call_rcu() with @func, * and @false otherwise. Emits a warning in any other case, including * the case where @rhp has already been invoked after a grace period. * Calls to this function must not race with callback invocation. One way * to avoid such races is to enclose the call to rcu_head_after_call_rcu() * in an RCU read-side critical section that includes a read-side fetch * of the pointer to the structure containing @rhp. */ static inline bool rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f) { rcu_callback_t func = READ_ONCE(rhp->func); if (func == f) return true; WARN_ON_ONCE(func != (rcu_callback_t)~0L); return false; } /* kernel/ksysfs.c definitions */ extern int rcu_expedited; extern int rcu_normal; #endif /* __LINUX_RCUPDATE_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 /* SPDX-License-Identifier: GPL-2.0-only */ /* * Fast and scalable bitmaps. * * Copyright (C) 2016 Facebook * Copyright (C) 2013-2014 Jens Axboe */ #ifndef __LINUX_SCALE_BITMAP_H #define __LINUX_SCALE_BITMAP_H #include <linux/kernel.h> #include <linux/slab.h> struct seq_file; /** * struct sbitmap_word - Word in a &struct sbitmap. */ struct sbitmap_word { /** * @depth: Number of bits being used in @word/@cleared */ unsigned long depth; /** * @word: word holding free bits */ unsigned long word ____cacheline_aligned_in_smp; /** * @cleared: word holding cleared bits */ unsigned long cleared ____cacheline_aligned_in_smp; /** * @swap_lock: Held while swapping word <-> cleared */ spinlock_t swap_lock; } ____cacheline_aligned_in_smp; /** * struct sbitmap - Scalable bitmap. * * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This * trades off higher memory usage for better scalability. */ struct sbitmap { /** * @depth: Number of bits used in the whole bitmap. */ unsigned int depth; /** * @shift: log2(number of bits used per word) */ unsigned int shift; /** * @map_nr: Number of words (cachelines) being used for the bitmap. */ unsigned int map_nr; /** * @map: Allocated bitmap. */ struct sbitmap_word *map; }; #define SBQ_WAIT_QUEUES 8 #define SBQ_WAKE_BATCH 8 /** * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. */ struct sbq_wait_state { /** * @wait_cnt: Number of frees remaining before we wake up. */ atomic_t wait_cnt; /** * @wait: Wait queue. */ wait_queue_head_t wait; } ____cacheline_aligned_in_smp; /** * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free * bits. * * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to * avoid contention on the wait queue spinlock. This ensures that we don't hit a * scalability wall when we run out of free bits and have to start putting tasks * to sleep. */ struct sbitmap_queue { /** * @sb: Scalable bitmap. */ struct sbitmap sb; /* * @alloc_hint: Cache of last successfully allocated or freed bit. * * This is per-cpu, which allows multiple users to stick to different * cachelines until the map is exhausted. */ unsigned int __percpu *alloc_hint; /** * @wake_batch: Number of bits which must be freed before we wake up any * waiters. */ unsigned int wake_batch; /** * @wake_index: Next wait queue in @ws to wake up. */ atomic_t wake_index; /** * @ws: Wait queues. */ struct sbq_wait_state *ws; /* * @ws_active: count of currently active ws waitqueues */ atomic_t ws_active; /** * @round_robin: Allocate bits in strict round-robin order. */ bool round_robin; /** * @min_shallow_depth: The minimum shallow depth which may be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). */ unsigned int min_shallow_depth; }; /** * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. * @sb: Bitmap to initialize. * @depth: Number of bits to allocate. * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if * given, a good default is chosen. * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, gfp_t flags, int node); /** * sbitmap_free() - Free memory used by a &struct sbitmap. * @sb: Bitmap to free. */ static inline void sbitmap_free(struct sbitmap *sb) { kfree(sb->map); sb->map = NULL; } /** * sbitmap_resize() - Resize a &struct sbitmap. * @sb: Bitmap to resize. * @depth: New number of bits to resize to. * * Doesn't reallocate anything. It's up to the caller to ensure that the new * depth doesn't exceed the depth that the sb was initialized with. */ void sbitmap_resize(struct sbitmap *sb, unsigned int depth); /** * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. * @sb: Bitmap to allocate from. * @alloc_hint: Hint for where to start searching for a free bit. * @round_robin: If true, be stricter about allocation order; always allocate * starting from the last allocated bit. This is less efficient * than the default behavior (false). * * This operation provides acquire barrier semantics if it succeeds. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); /** * sbitmap_get_shallow() - Try to allocate a free bit from a &struct sbitmap, * limiting the depth used from each word. * @sb: Bitmap to allocate from. * @alloc_hint: Hint for where to start searching for a free bit. * @shallow_depth: The maximum number of bits to allocate from a single word. * * This rather specific operation allows for having multiple users with * different allocation limits. E.g., there can be a high-priority class that * uses sbitmap_get() and a low-priority class that uses sbitmap_get_shallow() * with a @shallow_depth of (1 << (@sb->shift - 1)). Then, the low-priority * class can only allocate half of the total bits in the bitmap, preventing it * from starving out the high-priority class. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int sbitmap_get_shallow(struct sbitmap *sb, unsigned int alloc_hint, unsigned long shallow_depth); /** * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. * @sb: Bitmap to check. * * Return: true if any bit in the bitmap is set, false otherwise. */ bool sbitmap_any_bit_set(const struct sbitmap *sb); #define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) #define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); /** * __sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @start: Where to start the iteration. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. * * This is inline even though it's non-trivial so that the function calls to the * callback will hopefully get optimized away. */ static inline void __sbitmap_for_each_set(struct sbitmap *sb, unsigned int start, sb_for_each_fn fn, void *data) { unsigned int index; unsigned int nr; unsigned int scanned = 0; if (start >= sb->depth) start = 0; index = SB_NR_TO_INDEX(sb, start); nr = SB_NR_TO_BIT(sb, start); while (scanned < sb->depth) { unsigned long word; unsigned int depth = min_t(unsigned int, sb->map[index].depth - nr, sb->depth - scanned); scanned += depth; word = sb->map[index].word & ~sb->map[index].cleared; if (!word) goto next; /* * On the first iteration of the outer loop, we need to add the * bit offset back to the size of the word for find_next_bit(). * On all other iterations, nr is zero, so this is a noop. */ depth += nr; while (1) { nr = find_next_bit(&word, depth, nr); if (nr >= depth) break; if (!fn(sb, (index << sb->shift) + nr, data)) return; nr++; } next: nr = 0; if (++index >= sb->map_nr) index = 0; } } /** * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. * @sb: Bitmap to iterate over. * @fn: Callback. Should return true to continue or false to break early. * @data: Pointer to pass to callback. */ static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, void *data) { __sbitmap_for_each_set(sb, 0, fn, data); } static inline unsigned long *__sbitmap_word(struct sbitmap *sb, unsigned int bitnr) { return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; } /* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) { set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) { clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /* * This one is special, since it doesn't actually clear the bit, rather it * sets the corresponding bit in the ->cleared mask instead. Paired with * the caller doing sbitmap_deferred_clear() if a given index is full, which * will clear the previously freed entries in the corresponding ->word. */ static inline void sbitmap_deferred_clear_bit(struct sbitmap *sb, unsigned int bitnr) { unsigned long *addr = &sb->map[SB_NR_TO_INDEX(sb, bitnr)].cleared; set_bit(SB_NR_TO_BIT(sb, bitnr), addr); } static inline void sbitmap_clear_bit_unlock(struct sbitmap *sb, unsigned int bitnr) { clear_bit_unlock(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) { return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); } /** * sbitmap_show() - Dump &struct sbitmap information to a &struct seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_bitmap_show() - Write a hex dump of a &struct sbitmap to a &struct * seq_file. * @sb: Bitmap to show. * @m: struct seq_file to write to. * * This is intended for debugging. The output isn't guaranteed to be internally * consistent. */ void sbitmap_bitmap_show(struct sbitmap *sb, struct seq_file *m); /** * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific * memory node. * @sbq: Bitmap queue to initialize. * @depth: See sbitmap_init_node(). * @shift: See sbitmap_init_node(). * @round_robin: See sbitmap_get(). * @flags: Allocation flags. * @node: Memory node to allocate on. * * Return: Zero on success or negative errno on failure. */ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, int shift, bool round_robin, gfp_t flags, int node); /** * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. * * @sbq: Bitmap queue to free. */ static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) { kfree(sbq->ws); free_percpu(sbq->alloc_hint); sbitmap_free(&sbq->sb); } /** * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. * @sbq: Bitmap queue to resize. * @depth: New number of bits to resize to. * * Like sbitmap_resize(), this doesn't reallocate anything. It has to do * some extra work on the &struct sbitmap_queue, so it's not safe to just * resize the underlying &struct sbitmap. */ void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); /** * __sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue with preemption already disabled. * @sbq: Bitmap queue to allocate from. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get(struct sbitmap_queue *sbq); /** * __sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word, with preemption * already disabled. * @sbq: Bitmap queue to allocate from. * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ int __sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int shallow_depth); /** * sbitmap_queue_get() - Try to allocate a free bit from a &struct * sbitmap_queue. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, unsigned int *cpu) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get(sbq); put_cpu(); return nr; } /** * sbitmap_queue_get_shallow() - Try to allocate a free bit from a &struct * sbitmap_queue, limiting the depth used from each word. * @sbq: Bitmap queue to allocate from. * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to * sbitmap_queue_clear()). * @shallow_depth: The maximum number of bits to allocate from a single word. * See sbitmap_get_shallow(). * * If you call this, make sure to call sbitmap_queue_min_shallow_depth() after * initializing @sbq. * * Return: Non-negative allocated bit number if successful, -1 otherwise. */ static inline int sbitmap_queue_get_shallow(struct sbitmap_queue *sbq, unsigned int *cpu, unsigned int shallow_depth) { int nr; *cpu = get_cpu(); nr = __sbitmap_queue_get_shallow(sbq, shallow_depth); put_cpu(); return nr; } /** * sbitmap_queue_min_shallow_depth() - Inform a &struct sbitmap_queue of the * minimum shallow depth that will be used. * @sbq: Bitmap queue in question. * @min_shallow_depth: The minimum shallow depth that will be passed to * sbitmap_queue_get_shallow() or __sbitmap_queue_get_shallow(). * * sbitmap_queue_clear() batches wakeups as an optimization. The batch size * depends on the depth of the bitmap. Since the shallow allocation functions * effectively operate with a different depth, the shallow depth must be taken * into account when calculating the batch size. This function must be called * with the minimum shallow depth that will be used. Failure to do so can result * in missed wakeups. */ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq, unsigned int min_shallow_depth); /** * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a * &struct sbitmap_queue. * @sbq: Bitmap to free from. * @nr: Bit number to free. * @cpu: CPU the bit was allocated on. */ void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, unsigned int cpu); static inline int sbq_index_inc(int index) { return (index + 1) & (SBQ_WAIT_QUEUES - 1); } static inline void sbq_index_atomic_inc(atomic_t *index) { int old = atomic_read(index); int new = sbq_index_inc(old); atomic_cmpxchg(index, old, new); } /** * sbq_wait_ptr() - Get the next wait queue to use for a &struct * sbitmap_queue. * @sbq: Bitmap queue to wait on. * @wait_index: A counter per "user" of @sbq. */ static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, atomic_t *wait_index) { struct sbq_wait_state *ws; ws = &sbq->ws[atomic_read(wait_index)]; sbq_index_atomic_inc(wait_index); return ws; } /** * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct * sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); /** * sbitmap_queue_wake_up() - Wake up some of waiters in one waitqueue * on a &struct sbitmap_queue. * @sbq: Bitmap queue to wake up. */ void sbitmap_queue_wake_up(struct sbitmap_queue *sbq); /** * sbitmap_queue_show() - Dump &struct sbitmap_queue information to a &struct * seq_file. * @sbq: Bitmap queue to show. * @m: struct seq_file to write to. * * This is intended for debugging. The format may change at any time. */ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m); struct sbq_wait { struct sbitmap_queue *sbq; /* if set, sbq_wait is accounted */ struct wait_queue_entry wait; }; #define DEFINE_SBQ_WAIT(name) \ struct sbq_wait name = { \ .sbq = NULL, \ .wait = { \ .private = current, \ .func = autoremove_wake_function, \ .entry = LIST_HEAD_INIT((name).wait.entry), \ } \ } /* * Wrapper around prepare_to_wait_exclusive(), which maintains some extra * internal state. */ void sbitmap_prepare_to_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait, int state); /* * Must be paired with sbitmap_prepare_to_wait(). */ void sbitmap_finish_wait(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Wrapper around add_wait_queue(), which maintains some extra internal state */ void sbitmap_add_wait_queue(struct sbitmap_queue *sbq, struct sbq_wait_state *ws, struct sbq_wait *sbq_wait); /* * Must be paired with sbitmap_add_wait_queue() */ void sbitmap_del_wait_queue(struct sbq_wait *sbq_wait); #endif /* __LINUX_SCALE_BITMAP_H */
3 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 // SPDX-License-Identifier: GPL-2.0-only /* * A generic implementation of binary search for the Linux kernel * * Copyright (C) 2008-2009 Ksplice, Inc. * Author: Tim Abbott <tabbott@ksplice.com> */ #include <linux/export.h> #include <linux/bsearch.h> #include <linux/kprobes.h> /* * bsearch - binary search an array of elements * @key: pointer to item being searched for * @base: pointer to first element to search * @num: number of elements * @size: size of each element * @cmp: pointer to comparison function * * This function does a binary search on the given array. The * contents of the array should already be in ascending sorted order * under the provided comparison function. * * Note that the key need not have the same type as the elements in * the array, e.g. key could be a string and the comparison function * could compare the string with the struct's name field. However, if * the key and elements in the array are of the same type, you can use * the same comparison function for both sort() and bsearch(). */ void *bsearch(const void *key, const void *base, size_t num, size_t size, cmp_func_t cmp) { return __inline_bsearch(key, base, num, size, cmp); } EXPORT_SYMBOL(bsearch); NOKPROBE_SYMBOL(bsearch);
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 /* SPDX-License-Identifier: GPL-2.0 */ #ifndef LINUX_CRASH_DUMP_H #define LINUX_CRASH_DUMP_H #include <linux/kexec.h> #include <linux/proc_fs.h> #include <linux/elf.h> #include <linux/pgtable.h> #include <uapi/linux/vmcore.h> #include <linux/pgtable.h> /* for pgprot_t */ #ifdef CONFIG_CRASH_DUMP #define ELFCORE_ADDR_MAX (-1ULL) #define ELFCORE_ADDR_ERR (-2ULL) extern unsigned long long elfcorehdr_addr; extern unsigned long long elfcorehdr_size; extern int elfcorehdr_alloc(unsigned long long *addr, unsigned long long *size); extern void elfcorehdr_free(unsigned long long addr); extern ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos); extern ssize_t elfcorehdr_read_notes(char *buf, size_t count, u64 *ppos); extern int remap_oldmem_pfn_range(struct vm_area_struct *vma, unsigned long from, unsigned long pfn, unsigned long size, pgprot_t prot); extern ssize_t copy_oldmem_page(unsigned long, char *, size_t, unsigned long, int); extern ssize_t copy_oldmem_page_encrypted(unsigned long pfn, char *buf, size_t csize, unsigned long offset, int userbuf); void vmcore_cleanup(void); /* Architecture code defines this if there are other possible ELF * machine types, e.g. on bi-arch capable hardware. */ #ifndef vmcore_elf_check_arch_cross #define vmcore_elf_check_arch_cross(x) 0 #endif /* * Architecture code can redefine this if there are any special checks * needed for 32-bit ELF or 64-bit ELF vmcores. In case of 32-bit * only architecture, vmcore_elf64_check_arch can be set to zero. */ #ifndef vmcore_elf32_check_arch #define vmcore_elf32_check_arch(x) elf_check_arch(x) #endif #ifndef vmcore_elf64_check_arch #define vmcore_elf64_check_arch(x) (elf_check_arch(x) || vmcore_elf_check_arch_cross(x)) #endif /* * is_kdump_kernel() checks whether this kernel is booting after a panic of * previous kernel or not. This is determined by checking if previous kernel * has passed the elf core header address on command line. * * This is not just a test if CONFIG_CRASH_DUMP is enabled or not. It will * return true if CONFIG_CRASH_DUMP=y and if kernel is booting after a panic * of previous kernel. */ static inline bool is_kdump_kernel(void) { return elfcorehdr_addr != ELFCORE_ADDR_MAX; } /* is_vmcore_usable() checks if the kernel is booting after a panic and * the vmcore region is usable. * * This makes use of the fact that due to alignment -2ULL is not * a valid pointer, much in the vain of IS_ERR(), except * dealing directly with an unsigned long long rather than a pointer. */ static inline int is_vmcore_usable(void) { return is_kdump_kernel() && elfcorehdr_addr != ELFCORE_ADDR_ERR ? 1 : 0; } /* vmcore_unusable() marks the vmcore as unusable, * without disturbing the logic of is_kdump_kernel() */ static inline void vmcore_unusable(void) { if (is_kdump_kernel()) elfcorehdr_addr = ELFCORE_ADDR_ERR; } #define HAVE_OLDMEM_PFN_IS_RAM 1 extern int register_oldmem_pfn_is_ram(int (*fn)(unsigned long pfn)); extern void unregister_oldmem_pfn_is_ram(void); #else /* !CONFIG_CRASH_DUMP */ static inline bool is_kdump_kernel(void) { return 0; } #endif /* CONFIG_CRASH_DUMP */ /* Device Dump information to be filled by drivers */ struct vmcoredd_data { char dump_name[VMCOREDD_MAX_NAME_BYTES]; /* Unique name of the dump */ unsigned int size; /* Size of the dump */ /* Driver's registered callback to be invoked to collect dump */ int (*vmcoredd_callback)(struct vmcoredd_data *data, void *buf); }; #ifdef CONFIG_PROC_VMCORE_DEVICE_DUMP int vmcore_add_device_dump(struct vmcoredd_data *data); #else static inline int vmcore_add_device_dump(struct vmcoredd_data *data) { return -EOPNOTSUPP; } #endif /* CONFIG_PROC_VMCORE_DEVICE_DUMP */ #ifdef CONFIG_PROC_VMCORE ssize_t read_from_oldmem(char *buf, size_t count, u64 *ppos, int userbuf, bool encrypted); #else static inline ssize_t read_from_oldmem(char *buf, size_t count, u64 *ppos, int userbuf, bool encrypted) { return -EOPNOTSUPP; } #endif /* CONFIG_PROC_VMCORE */ #endif /* LINUX_CRASHDUMP_H */
3 5 3 3 3 3 3 3 5 5 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 /* SPDX-License-Identifier: GPL-2.0 */ /* * Filesystem access notification for Linux * * Copyright (C) 2008 Red Hat, Inc., Eric Paris <eparis@redhat.com> */ #ifndef __LINUX_FSNOTIFY_BACKEND_H #define __LINUX_FSNOTIFY_BACKEND_H #ifdef __KERNEL__ #include <linux/idr.h> /* inotify uses this */ #include <linux/fs.h> /* struct inode */ #include <linux/list.h> #include <linux/path.h> /* struct path */ #include <linux/spinlock.h> #include <linux/types.h> #include <linux/atomic.h> #include <linux/user_namespace.h> #include <linux/refcount.h> /* * IN_* from inotfy.h lines up EXACTLY with FS_*, this is so we can easily * convert between them. dnotify only needs conversion at watch creation * so no perf loss there. fanotify isn't defined yet, so it can use the * wholes if it needs more events. */ #define FS_ACCESS 0x00000001 /* File was accessed */ #define FS_MODIFY 0x00000002 /* File was modified */ #define FS_ATTRIB 0x00000004 /* Metadata changed */ #define FS_CLOSE_WRITE 0x00000008 /* Writtable file was closed */ #define FS_CLOSE_NOWRITE 0x00000010 /* Unwrittable file closed */ #define FS_OPEN 0x00000020 /* File was opened */ #define FS_MOVED_FROM 0x00000040 /* File was moved from X */ #define FS_MOVED_TO 0x00000080 /* File was moved to Y */ #define FS_CREATE 0x00000100 /* Subfile was created */ #define FS_DELETE 0x00000200 /* Subfile was deleted */ #define FS_DELETE_SELF 0x00000400 /* Self was deleted */ #define FS_MOVE_SELF 0x00000800 /* Self was moved */ #define FS_OPEN_EXEC 0x00001000 /* File was opened for exec */ #define FS_UNMOUNT 0x00002000 /* inode on umount fs */ #define FS_Q_OVERFLOW 0x00004000 /* Event queued overflowed */ #define FS_IN_IGNORED 0x00008000 /* last inotify event here */ #define FS_OPEN_PERM 0x00010000 /* open event in an permission hook */ #define FS_ACCESS_PERM 0x00020000 /* access event in a permissions hook */ #define FS_OPEN_EXEC_PERM 0x00040000 /* open/exec event in a permission hook */ #define FS_EXCL_UNLINK 0x04000000 /* do not send events if object is unlinked */ /* * Set on inode mark that cares about things that happen to its children. * Always set for dnotify and inotify. * Set on inode/sb/mount marks that care about parent/name info. */ #define FS_EVENT_ON_CHILD 0x08000000 #define FS_DN_RENAME 0x10000000 /* file renamed */ #define FS_DN_MULTISHOT 0x20000000 /* dnotify multishot */ #define FS_ISDIR 0x40000000 /* event occurred against dir */ #define FS_IN_ONESHOT 0x80000000 /* only send event once */ #define FS_MOVE (FS_MOVED_FROM | FS_MOVED_TO) /* * Directory entry modification events - reported only to directory * where entry is modified and not to a watching parent. * The watching parent may get an FS_ATTRIB|FS_EVENT_ON_CHILD event * when a directory entry inside a child subdir changes. */ #define ALL_FSNOTIFY_DIRENT_EVENTS (FS_CREATE | FS_DELETE | FS_MOVE) #define ALL_FSNOTIFY_PERM_EVENTS (FS_OPEN_PERM | FS_ACCESS_PERM | \ FS_OPEN_EXEC_PERM) /* * This is a list of all events that may get sent to a parent that is watching * with flag FS_EVENT_ON_CHILD based on fs event on a child of that directory. */ #define FS_EVENTS_POSS_ON_CHILD (ALL_FSNOTIFY_PERM_EVENTS | \ FS_ACCESS | FS_MODIFY | FS_ATTRIB | \ FS_CLOSE_WRITE | FS_CLOSE_NOWRITE | \ FS_OPEN | FS_OPEN_EXEC) /* * This is a list of all events that may get sent with the parent inode as the * @to_tell argument of fsnotify(). * It may include events that can be sent to an inode/sb/mount mark, but cannot * be sent to a parent watching children. */ #define FS_EVENTS_POSS_TO_PARENT (FS_EVENTS_POSS_ON_CHILD) /* Events that can be reported to backends */ #define ALL_FSNOTIFY_EVENTS (ALL_FSNOTIFY_DIRENT_EVENTS | \ FS_EVENTS_POSS_ON_CHILD | \ FS_DELETE_SELF | FS_MOVE_SELF | FS_DN_RENAME | \ FS_UNMOUNT | FS_Q_OVERFLOW | FS_IN_IGNORED) /* Extra flags that may be reported with event or control handling of events */ #define ALL_FSNOTIFY_FLAGS (FS_EXCL_UNLINK | FS_ISDIR | FS_IN_ONESHOT | \ FS_DN_MULTISHOT | FS_EVENT_ON_CHILD) #define ALL_FSNOTIFY_BITS (ALL_FSNOTIFY_EVENTS | ALL_FSNOTIFY_FLAGS) struct fsnotify_group; struct fsnotify_event; struct fsnotify_mark; struct fsnotify_event_private_data; struct fsnotify_fname; struct fsnotify_iter_info; struct mem_cgroup; /* * Each group much define these ops. The fsnotify infrastructure will call * these operations for each relevant group. * * handle_event - main call for a group to handle an fs event * @group: group to notify * @mask: event type and flags * @data: object that event happened on * @data_type: type of object for fanotify_data_XXX() accessors * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to * @file_name: optional file name associated with event * @cookie: inotify rename cookie * @iter_info: array of marks from this group that are interested in the event * * handle_inode_event - simple variant of handle_event() for groups that only * have inode marks and don't have ignore mask * @mark: mark to notify * @mask: event type and flags * @inode: inode that event happened on * @dir: optional directory associated with event - * if @file_name is not NULL, this is the directory that * @file_name is relative to. * @file_name: optional file name associated with event * @cookie: inotify rename cookie * * free_group_priv - called when a group refcnt hits 0 to clean up the private union * freeing_mark - called when a mark is being destroyed for some reason. The group * MUST be holding a reference on each mark and that reference must be * dropped in this function. inotify uses this function to send * userspace messages that marks have been removed. */ struct fsnotify_ops { int (*handle_event)(struct fsnotify_group *group, u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *file_name, u32 cookie, struct fsnotify_iter_info *iter_info); int (*handle_inode_event)(struct fsnotify_mark *mark, u32 mask, struct inode *inode, struct inode *dir, const struct qstr *file_name, u32 cookie); void (*free_group_priv)(struct fsnotify_group *group); void (*freeing_mark)(struct fsnotify_mark *mark, struct fsnotify_group *group); void (*free_event)(struct fsnotify_event *event); /* called on final put+free to free memory */ void (*free_mark)(struct fsnotify_mark *mark); }; /* * all of the information about the original object we want to now send to * a group. If you want to carry more info from the accessing task to the * listener this structure is where you need to be adding fields. */ struct fsnotify_event { struct list_head list; unsigned long objectid; /* identifier for queue merges */ }; /* * A group is a "thing" that wants to receive notification about filesystem * events. The mask holds the subset of event types this group cares about. * refcnt on a group is up to the implementor and at any moment if it goes 0 * everything will be cleaned up. */ struct fsnotify_group { const struct fsnotify_ops *ops; /* how this group handles things */ /* * How the refcnt is used is up to each group. When the refcnt hits 0 * fsnotify will clean up all of the resources associated with this group. * As an example, the dnotify group will always have a refcnt=1 and that * will never change. Inotify, on the other hand, has a group per * inotify_init() and the refcnt will hit 0 only when that fd has been * closed. */ refcount_t refcnt; /* things with interest in this group */ /* needed to send notification to userspace */ spinlock_t notification_lock; /* protect the notification_list */ struct list_head notification_list; /* list of event_holder this group needs to send to userspace */ wait_queue_head_t notification_waitq; /* read() on the notification file blocks on this waitq */ unsigned int q_len; /* events on the queue */ unsigned int max_events; /* maximum events allowed on the list */ /* * Valid fsnotify group priorities. Events are send in order from highest * priority to lowest priority. We default to the lowest priority. */ #define FS_PRIO_0 0 /* normal notifiers, no permissions */ #define FS_PRIO_1 1 /* fanotify content based access control */ #define FS_PRIO_2 2 /* fanotify pre-content access */ unsigned int priority; bool shutdown; /* group is being shut down, don't queue more events */ /* stores all fastpath marks assoc with this group so they can be cleaned on unregister */ struct mutex mark_mutex; /* protect marks_list */ atomic_t num_marks; /* 1 for each mark and 1 for not being * past the point of no return when freeing * a group */ atomic_t user_waits; /* Number of tasks waiting for user * response */ struct list_head marks_list; /* all inode marks for this group */ struct fasync_struct *fsn_fa; /* async notification */ struct fsnotify_event *overflow_event; /* Event we queue when the * notification list is too * full */ struct mem_cgroup *memcg; /* memcg to charge allocations */ /* groups can define private fields here or use the void *private */ union { void *private; #ifdef CONFIG_INOTIFY_USER struct inotify_group_private_data { spinlock_t idr_lock; struct idr idr; struct ucounts *ucounts; } inotify_data; #endif #ifdef CONFIG_FANOTIFY struct fanotify_group_private_data { /* allows a group to block waiting for a userspace response */ struct list_head access_list; wait_queue_head_t access_waitq; int flags; /* flags from fanotify_init() */ int f_flags; /* event_f_flags from fanotify_init() */ unsigned int max_marks; struct user_struct *user; } fanotify_data; #endif /* CONFIG_FANOTIFY */ }; }; /* When calling fsnotify tell it if the data is a path or inode */ enum fsnotify_data_type { FSNOTIFY_EVENT_NONE, FSNOTIFY_EVENT_PATH, FSNOTIFY_EVENT_INODE, }; static inline struct inode *fsnotify_data_inode(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_INODE: return (struct inode *)data; case FSNOTIFY_EVENT_PATH: return d_inode(((const struct path *)data)->dentry); default: return NULL; } } static inline const struct path *fsnotify_data_path(const void *data, int data_type) { switch (data_type) { case FSNOTIFY_EVENT_PATH: return data; default: return NULL; } } enum fsnotify_obj_type { FSNOTIFY_OBJ_TYPE_INODE, FSNOTIFY_OBJ_TYPE_PARENT, FSNOTIFY_OBJ_TYPE_VFSMOUNT, FSNOTIFY_OBJ_TYPE_SB, FSNOTIFY_OBJ_TYPE_COUNT, FSNOTIFY_OBJ_TYPE_DETACHED = FSNOTIFY_OBJ_TYPE_COUNT }; #define FSNOTIFY_OBJ_TYPE_INODE_FL (1U << FSNOTIFY_OBJ_TYPE_INODE) #define FSNOTIFY_OBJ_TYPE_PARENT_FL (1U << FSNOTIFY_OBJ_TYPE_PARENT) #define FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL (1U << FSNOTIFY_OBJ_TYPE_VFSMOUNT) #define FSNOTIFY_OBJ_TYPE_SB_FL (1U << FSNOTIFY_OBJ_TYPE_SB) #define FSNOTIFY_OBJ_ALL_TYPES_MASK ((1U << FSNOTIFY_OBJ_TYPE_COUNT) - 1) static inline bool fsnotify_valid_obj_type(unsigned int type) { return (type < FSNOTIFY_OBJ_TYPE_COUNT); } struct fsnotify_iter_info { struct fsnotify_mark *marks[FSNOTIFY_OBJ_TYPE_COUNT]; unsigned int report_mask; int srcu_idx; }; static inline bool fsnotify_iter_should_report_type( struct fsnotify_iter_info *iter_info, int type) { return (iter_info->report_mask & (1U << type)); } static inline void fsnotify_iter_set_report_type( struct fsnotify_iter_info *iter_info, int type) { iter_info->report_mask |= (1U << type); } static inline void fsnotify_iter_set_report_type_mark( struct fsnotify_iter_info *iter_info, int type, struct fsnotify_mark *mark) { iter_info->marks[type] = mark; iter_info->report_mask |= (1U << type); } #define FSNOTIFY_ITER_FUNCS(name, NAME) \ static inline struct fsnotify_mark *fsnotify_iter_##name##_mark( \ struct fsnotify_iter_info *iter_info) \ { \ return (iter_info->report_mask & FSNOTIFY_OBJ_TYPE_##NAME##_FL) ? \ iter_info->marks[FSNOTIFY_OBJ_TYPE_##NAME] : NULL; \ } FSNOTIFY_ITER_FUNCS(inode, INODE) FSNOTIFY_ITER_FUNCS(parent, PARENT) FSNOTIFY_ITER_FUNCS(vfsmount, VFSMOUNT) FSNOTIFY_ITER_FUNCS(sb, SB) #define fsnotify_foreach_obj_type(type) \ for (type = 0; type < FSNOTIFY_OBJ_TYPE_COUNT; type++) /* * fsnotify_connp_t is what we embed in objects which connector can be attached * to. fsnotify_connp_t * is how we refer from connector back to object. */ struct fsnotify_mark_connector; typedef struct fsnotify_mark_connector __rcu *fsnotify_connp_t; /* * Inode/vfsmount/sb point to this structure which tracks all marks attached to * the inode/vfsmount/sb. The reference to inode/vfsmount/sb is held by this * structure. We destroy this structure when there are no more marks attached * to it. The structure is protected by fsnotify_mark_srcu. */ struct fsnotify_mark_connector { spinlock_t lock; unsigned short type; /* Type of object [lock] */ #define FSNOTIFY_CONN_FLAG_HAS_FSID 0x01 unsigned short flags; /* flags [lock] */ __kernel_fsid_t fsid; /* fsid of filesystem containing object */ union { /* Object pointer [lock] */ fsnotify_connp_t *obj; /* Used listing heads to free after srcu period expires */ struct fsnotify_mark_connector *destroy_next; }; struct hlist_head list; }; /* * A mark is simply an object attached to an in core inode which allows an * fsnotify listener to indicate they are either no longer interested in events * of a type matching mask or only interested in those events. * * These are flushed when an inode is evicted from core and may be flushed * when the inode is modified (as seen by fsnotify_access). Some fsnotify * users (such as dnotify) will flush these when the open fd is closed and not * at inode eviction or modification. * * Text in brackets is showing the lock(s) protecting modifications of a * particular entry. obj_lock means either inode->i_lock or * mnt->mnt_root->d_lock depending on the mark type. */ struct fsnotify_mark { /* Mask this mark is for [mark->lock, group->mark_mutex] */ __u32 mask; /* We hold one for presence in g_list. Also one ref for each 'thing' * in kernel that found and may be using this mark. */ refcount_t refcnt; /* Group this mark is for. Set on mark creation, stable until last ref * is dropped */ struct fsnotify_group *group; /* List of marks by group->marks_list. Also reused for queueing * mark into destroy_list when it's waiting for the end of SRCU period * before it can be freed. [group->mark_mutex] */ struct list_head g_list; /* Protects inode / mnt pointers, flags, masks */ spinlock_t lock; /* List of marks for inode / vfsmount [connector->lock, mark ref] */ struct hlist_node obj_list; /* Head of list of marks for an object [mark ref] */ struct fsnotify_mark_connector *connector; /* Events types to ignore [mark->lock, group->mark_mutex] */ __u32 ignored_mask; #define FSNOTIFY_MARK_FLAG_IGNORED_SURV_MODIFY 0x01 #define FSNOTIFY_MARK_FLAG_ALIVE 0x02 #define FSNOTIFY_MARK_FLAG_ATTACHED 0x04 unsigned int flags; /* flags [mark->lock] */ }; #ifdef CONFIG_FSNOTIFY /* called from the vfs helpers */ /* main fsnotify call to send events */ extern int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie); extern int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type); extern void __fsnotify_inode_delete(struct inode *inode); extern void __fsnotify_vfsmount_delete(struct vfsmount *mnt); extern void fsnotify_sb_delete(struct super_block *sb); extern u32 fsnotify_get_cookie(void); static inline __u32 fsnotify_parent_needed_mask(__u32 mask) { /* FS_EVENT_ON_CHILD is set on marks that want parent/name info */ if (!(mask & FS_EVENT_ON_CHILD)) return 0; /* * This object might be watched by a mark that cares about parent/name * info, does it care about the specific set of events that can be * reported with parent/name info? */ return mask & FS_EVENTS_POSS_TO_PARENT; } static inline int fsnotify_inode_watches_children(struct inode *inode) { /* FS_EVENT_ON_CHILD is set if the inode may care */ if (!(inode->i_fsnotify_mask & FS_EVENT_ON_CHILD)) return 0; /* this inode might care about child events, does it care about the * specific set of events that can happen on a child? */ return inode->i_fsnotify_mask & FS_EVENTS_POSS_ON_CHILD; } /* * Update the dentry with a flag indicating the interest of its parent to receive * filesystem events when those events happens to this dentry->d_inode. */ static inline void fsnotify_update_flags(struct dentry *dentry) { assert_spin_locked(&dentry->d_lock); /* * Serialisation of setting PARENT_WATCHED on the dentries is provided * by d_lock. If inotify_inode_watched changes after we have taken * d_lock, the following __fsnotify_update_child_dentry_flags call will * find our entry, so it will spin until we complete here, and update * us with the new state. */ if (fsnotify_inode_watches_children(dentry->d_parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; else dentry->d_flags &= ~DCACHE_FSNOTIFY_PARENT_WATCHED; } /* called from fsnotify listeners, such as fanotify or dnotify */ /* create a new group */ extern struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops); /* get reference to a group */ extern void fsnotify_get_group(struct fsnotify_group *group); /* drop reference on a group from fsnotify_alloc_group */ extern void fsnotify_put_group(struct fsnotify_group *group); /* group destruction begins, stop queuing new events */ extern void fsnotify_group_stop_queueing(struct fsnotify_group *group); /* destroy group */ extern void fsnotify_destroy_group(struct fsnotify_group *group); /* fasync handler function */ extern int fsnotify_fasync(int fd, struct file *file, int on); /* Free event from memory */ extern void fsnotify_destroy_event(struct fsnotify_group *group, struct fsnotify_event *event); /* attach the event to the group notification queue */ extern int fsnotify_add_event(struct fsnotify_group *group, struct fsnotify_event *event, int (*merge)(struct list_head *, struct fsnotify_event *)); /* Queue overflow event to a notification group */ static inline void fsnotify_queue_overflow(struct fsnotify_group *group) { fsnotify_add_event(group, group->overflow_event, NULL); } /* true if the group notification queue is empty */ extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group); /* return, but do not dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_peek_first_event(struct fsnotify_group *group); /* return AND dequeue the first event on the notification queue */ extern struct fsnotify_event *fsnotify_remove_first_event(struct fsnotify_group *group); /* Remove event queued in the notification list */ extern void fsnotify_remove_queued_event(struct fsnotify_group *group, struct fsnotify_event *event); /* functions used to manipulate the marks attached to inodes */ /* Get mask of events for a list of marks */ extern __u32 fsnotify_conn_mask(struct fsnotify_mark_connector *conn); /* Calculate mask of events for a list of marks */ extern void fsnotify_recalc_mask(struct fsnotify_mark_connector *conn); extern void fsnotify_init_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* Find mark belonging to given group in the list of marks */ extern struct fsnotify_mark *fsnotify_find_mark(fsnotify_connp_t *connp, struct fsnotify_group *group); /* Get cached fsid of filesystem containing object */ extern int fsnotify_get_conn_fsid(const struct fsnotify_mark_connector *conn, __kernel_fsid_t *fsid); /* attach the mark to the object */ extern int fsnotify_add_mark(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, int allow_dups, __kernel_fsid_t *fsid); extern int fsnotify_add_mark_locked(struct fsnotify_mark *mark, fsnotify_connp_t *connp, unsigned int type, int allow_dups, __kernel_fsid_t *fsid); /* attach the mark to the inode */ static inline int fsnotify_add_inode_mark(struct fsnotify_mark *mark, struct inode *inode, int allow_dups) { return fsnotify_add_mark(mark, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL); } static inline int fsnotify_add_inode_mark_locked(struct fsnotify_mark *mark, struct inode *inode, int allow_dups) { return fsnotify_add_mark_locked(mark, &inode->i_fsnotify_marks, FSNOTIFY_OBJ_TYPE_INODE, allow_dups, NULL); } /* given a group and a mark, flag mark to be freed when all references are dropped */ extern void fsnotify_destroy_mark(struct fsnotify_mark *mark, struct fsnotify_group *group); /* detach mark from inode / mount list, group list, drop inode reference */ extern void fsnotify_detach_mark(struct fsnotify_mark *mark); /* free mark */ extern void fsnotify_free_mark(struct fsnotify_mark *mark); /* Wait until all marks queued for destruction are destroyed */ extern void fsnotify_wait_marks_destroyed(void); /* run all the marks in a group, and clear all of the marks attached to given object type */ extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group, unsigned int type); /* run all the marks in a group, and clear all of the vfsmount marks */ static inline void fsnotify_clear_vfsmount_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_VFSMOUNT_FL); } /* run all the marks in a group, and clear all of the inode marks */ static inline void fsnotify_clear_inode_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_INODE_FL); } /* run all the marks in a group, and clear all of the sn marks */ static inline void fsnotify_clear_sb_marks_by_group(struct fsnotify_group *group) { fsnotify_clear_marks_by_group(group, FSNOTIFY_OBJ_TYPE_SB_FL); } extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); extern void fsnotify_finish_user_wait(struct fsnotify_iter_info *iter_info); extern bool fsnotify_prepare_user_wait(struct fsnotify_iter_info *iter_info); static inline void fsnotify_init_event(struct fsnotify_event *event, unsigned long objectid) { INIT_LIST_HEAD(&event->list); event->objectid = objectid; } #else static inline int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, const struct qstr *name, struct inode *inode, u32 cookie) { return 0; } static inline int __fsnotify_parent(struct dentry *dentry, __u32 mask, const void *data, int data_type) { return 0; } static inline void __fsnotify_inode_delete(struct inode *inode) {} static inline void __fsnotify_vfsmount_delete(struct vfsmount *mnt) {} static inline void fsnotify_sb_delete(struct super_block *sb) {} static inline void fsnotify_update_flags(struct dentry *dentry) {} static inline u32 fsnotify_get_cookie(void) { return 0; } static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ #endif /* __KERNEL __ */ #endif /* __LINUX_FSNOTIFY_BACKEND_H */
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 /* SPDX-License-Identifier: GPL-2.0-or-later */ /* * Framework and drivers for configuring and reading different PHYs * Based on code in sungem_phy.c and (long-removed) gianfar_phy.c * * Author: Andy Fleming * * Copyright (c) 2004 Freescale Semiconductor, Inc. */ #ifndef __PHY_H #define __PHY_H #include <linux/compiler.h> #include <linux/spinlock.h> #include <linux/ethtool.h> #include <linux/linkmode.h> #include <linux/netlink.h> #include <linux/mdio.h> #include <linux/mii.h> #include <linux/mii_timestamper.h> #include <linux/module.h> #include <linux/timer.h> #include <linux/workqueue.h> #include <linux/mod_devicetable.h> #include <linux/u64_stats_sync.h> #include <linux/irqreturn.h> #include <linux/iopoll.h> #include <linux/refcount.h> #include <linux/atomic.h> #define PHY_DEFAULT_FEATURES (SUPPORTED_Autoneg | \ SUPPORTED_TP | \ SUPPORTED_MII) #define PHY_10BT_FEATURES (SUPPORTED_10baseT_Half | \ SUPPORTED_10baseT_Full) #define PHY_100BT_FEATURES (SUPPORTED_100baseT_Half | \ SUPPORTED_100baseT_Full) #define PHY_1000BT_FEATURES (SUPPORTED_1000baseT_Half | \ SUPPORTED_1000baseT_Full) extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_basic_t1_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_fibre_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_gbit_all_ports_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_fec_features) __ro_after_init; extern __ETHTOOL_DECLARE_LINK_MODE_MASK(phy_10gbit_full_features) __ro_after_init; #define PHY_BASIC_FEATURES ((unsigned long *)&phy_basic_features) #define PHY_BASIC_T1_FEATURES ((unsigned long *)&phy_basic_t1_features) #define PHY_GBIT_FEATURES ((unsigned long *)&phy_gbit_features) #define PHY_GBIT_FIBRE_FEATURES ((unsigned long *)&phy_gbit_fibre_features) #define PHY_GBIT_ALL_PORTS_FEATURES ((unsigned long *)&phy_gbit_all_ports_features) #define PHY_10GBIT_FEATURES ((unsigned long *)&phy_10gbit_features) #define PHY_10GBIT_FEC_FEATURES ((unsigned long *)&phy_10gbit_fec_features) #define PHY_10GBIT_FULL_FEATURES ((unsigned long *)&phy_10gbit_full_features) extern const int phy_basic_ports_array[3]; extern const int phy_fibre_port_array[1]; extern const int phy_all_ports_features_array[7]; extern const int phy_10_100_features_array[4]; extern const int phy_basic_t1_features_array[2]; extern const int phy_gbit_features_array[2]; extern const int phy_10gbit_features_array[1]; /* * Set phydev->irq to PHY_POLL if interrupts are not supported, * or not desired for this PHY. Set to PHY_IGNORE_INTERRUPT if * the attached driver handles the interrupt */ #define PHY_POLL -1 #define PHY_IGNORE_INTERRUPT -2 #define PHY_IS_INTERNAL 0x00000001 #define PHY_RST_AFTER_CLK_EN 0x00000002 #define PHY_POLL_CABLE_TEST 0x00000004 #define MDIO_DEVICE_IS_PHY 0x80000000 /** * enum phy_interface_t - Interface Mode definitions * * @PHY_INTERFACE_MODE_NA: Not Applicable - don't touch * @PHY_INTERFACE_MODE_INTERNAL: No interface, MAC and PHY combined * @PHY_INTERFACE_MODE_MII: Median-independent interface * @PHY_INTERFACE_MODE_GMII: Gigabit median-independent interface * @PHY_INTERFACE_MODE_SGMII: Serial gigabit media-independent interface * @PHY_INTERFACE_MODE_TBI: Ten Bit Interface * @PHY_INTERFACE_MODE_REVMII: Reverse Media Independent Interface * @PHY_INTERFACE_MODE_RMII: Reduced Media Independent Interface * @PHY_INTERFACE_MODE_RGMII: Reduced gigabit media-independent interface * @PHY_INTERFACE_MODE_RGMII_ID: RGMII with Internal RX+TX delay * @PHY_INTERFACE_MODE_RGMII_RXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RGMII_TXID: RGMII with Internal RX delay * @PHY_INTERFACE_MODE_RTBI: Reduced TBI * @PHY_INTERFACE_MODE_SMII: ??? MII * @PHY_INTERFACE_MODE_XGMII: 10 gigabit media-independent interface * @PHY_INTERFACE_MODE_XLGMII:40 gigabit media-independent interface * @PHY_INTERFACE_MODE_MOCA: Multimedia over Coax * @PHY_INTERFACE_MODE_QSGMII: Quad SGMII * @PHY_INTERFACE_MODE_TRGMII: Turbo RGMII * @PHY_INTERFACE_MODE_1000BASEX: 1000 BaseX * @PHY_INTERFACE_MODE_2500BASEX: 2500 BaseX * @PHY_INTERFACE_MODE_RXAUI: Reduced XAUI * @PHY_INTERFACE_MODE_XAUI: 10 Gigabit Attachment Unit Interface * @PHY_INTERFACE_MODE_10GBASER: 10G BaseR * @PHY_INTERFACE_MODE_USXGMII: Universal Serial 10GE MII * @PHY_INTERFACE_MODE_10GKR: 10GBASE-KR - with Clause 73 AN * @PHY_INTERFACE_MODE_MAX: Book keeping * * Describes the interface between the MAC and PHY. */ typedef enum { PHY_INTERFACE_MODE_NA, PHY_INTERFACE_MODE_INTERNAL, PHY_INTERFACE_MODE_MII, PHY_INTERFACE_MODE_GMII, PHY_INTERFACE_MODE_SGMII, PHY_INTERFACE_MODE_TBI, PHY_INTERFACE_MODE_REVMII, PHY_INTERFACE_MODE_RMII, PHY_INTERFACE_MODE_RGMII, PHY_INTERFACE_MODE_RGMII_ID, PHY_INTERFACE_MODE_RGMII_RXID, PHY_INTERFACE_MODE_RGMII_TXID, PHY_INTERFACE_MODE_RTBI, PHY_INTERFACE_MODE_SMII, PHY_INTERFACE_MODE_XGMII, PHY_INTERFACE_MODE_XLGMII, PHY_INTERFACE_MODE_MOCA, PHY_INTERFACE_MODE_QSGMII, PHY_INTERFACE_MODE_TRGMII, PHY_INTERFACE_MODE_1000BASEX, PHY_INTERFACE_MODE_2500BASEX, PHY_INTERFACE_MODE_RXAUI, PHY_INTERFACE_MODE_XAUI, /* 10GBASE-R, XFI, SFI - single lane 10G Serdes */ PHY_INTERFACE_MODE_10GBASER, PHY_INTERFACE_MODE_USXGMII, /* 10GBASE-KR - with Clause 73 AN */ PHY_INTERFACE_MODE_10GKR, PHY_INTERFACE_MODE_MAX, } phy_interface_t; /* * phy_supported_speeds - return all speeds currently supported by a PHY device */ unsigned int phy_supported_speeds(struct phy_device *phy, unsigned int *speeds, unsigned int size); /** * phy_modes - map phy_interface_t enum to device tree binding of phy-mode * @interface: enum phy_interface_t value * * Description: maps enum &phy_interface_t defined in this file * into the device tree binding of 'phy-mode', so that Ethernet * device driver can get PHY interface from device tree. */ static inline const char *phy_modes(phy_interface_t interface) { switch (interface) { case PHY_INTERFACE_MODE_NA: return ""; case PHY_INTERFACE_MODE_INTERNAL: return "internal"; case PHY_INTERFACE_MODE_MII: return "mii"; case PHY_INTERFACE_MODE_GMII: return "gmii"; case PHY_INTERFACE_MODE_SGMII: return "sgmii"; case PHY_INTERFACE_MODE_TBI: return "tbi"; case PHY_INTERFACE_MODE_REVMII: return "rev-mii"; case PHY_INTERFACE_MODE_RMII: return "rmii"; case PHY_INTERFACE_MODE_RGMII: return "rgmii"; case PHY_INTERFACE_MODE_RGMII_ID: return "rgmii-id"; case PHY_INTERFACE_MODE_RGMII_RXID: return "rgmii-rxid"; case PHY_INTERFACE_MODE_RGMII_TXID: return "rgmii-txid"; case PHY_INTERFACE_MODE_RTBI: return "rtbi"; case PHY_INTERFACE_MODE_SMII: return "smii"; case PHY_INTERFACE_MODE_XGMII: return "xgmii"; case PHY_INTERFACE_MODE_XLGMII: return "xlgmii"; case PHY_INTERFACE_MODE_MOCA: return "moca"; case PHY_INTERFACE_MODE_QSGMII: return "qsgmii"; case PHY_INTERFACE_MODE_TRGMII: return "trgmii"; case PHY_INTERFACE_MODE_1000BASEX: return "1000base-x"; case PHY_INTERFACE_MODE_2500BASEX: return "2500base-x"; case PHY_INTERFACE_MODE_RXAUI: return "rxaui"; case PHY_INTERFACE_MODE_XAUI: return "xaui"; case PHY_INTERFACE_MODE_10GBASER: return "10gbase-r"; case PHY_INTERFACE_MODE_USXGMII: return "usxgmii"; case PHY_INTERFACE_MODE_10GKR: return "10gbase-kr"; default: return "unknown"; } } #define PHY_INIT_TIMEOUT 100000 #define PHY_FORCE_TIMEOUT 10 #define PHY_MAX_ADDR 32 /* Used when trying to connect to a specific phy (mii bus id:phy device id) */ #define PHY_ID_FMT "%s:%02x" #define MII_BUS_ID_SIZE 61 struct device; struct phylink; struct sfp_bus; struct sfp_upstream_ops; struct sk_buff; /** * struct mdio_bus_stats - Statistics counters for MDIO busses * @transfers: Total number of transfers, i.e. @writes + @reads * @errors: Number of MDIO transfers that returned an error * @writes: Number of write transfers * @reads: Number of read transfers * @syncp: Synchronisation for incrementing statistics */ struct mdio_bus_stats { u64_stats_t transfers; u64_stats_t errors; u64_stats_t writes; u64_stats_t reads; /* Must be last, add new statistics above */ struct u64_stats_sync syncp; }; /** * struct phy_package_shared - Shared information in PHY packages * @addr: Common PHY address used to combine PHYs in one package * @refcnt: Number of PHYs connected to this shared data * @flags: Initialization of PHY package * @priv_size: Size of the shared private data @priv * @priv: Driver private data shared across a PHY package * * Represents a shared structure between different phydev's in the same * package, for example a quad PHY. See phy_package_join() and * phy_package_leave(). */ struct phy_package_shared { int addr; refcount_t refcnt; unsigned long flags; size_t priv_size; /* private data pointer */ /* note that this pointer is shared between different phydevs and * the user has to take care of appropriate locking. It is allocated * and freed automatically by phy_package_join() and * phy_package_leave(). */ void *priv; }; /* used as bit number in atomic bitops */ #define PHY_SHARED_F_INIT_DONE 0 #define PHY_SHARED_F_PROBE_DONE 1 /** * struct mii_bus - Represents an MDIO bus * * @owner: Who owns this device * @name: User friendly name for this MDIO device, or driver name * @id: Unique identifier for this bus, typical from bus hierarchy * @priv: Driver private data * * The Bus class for PHYs. Devices which provide access to * PHYs should register using this structure */ struct mii_bus { struct module *owner; const char *name; char id[MII_BUS_ID_SIZE]; void *priv; /** @read: Perform a read transfer on the bus */ int (*read)(struct mii_bus *bus, int addr, int regnum); /** @write: Perform a write transfer on the bus */ int (*write)(struct mii_bus *bus, int addr, int regnum, u16 val); /** @reset: Perform a reset of the bus */ int (*reset)(struct mii_bus *bus); /** @stats: Statistic counters per device on the bus */ struct mdio_bus_stats stats[PHY_MAX_ADDR]; /** * @mdio_lock: A lock to ensure that only one thing can read/write * the MDIO bus at a time */ struct mutex mdio_lock; /** @parent: Parent device of this bus */ struct device *parent; /** @state: State of bus structure */ enum { MDIOBUS_ALLOCATED = 1, MDIOBUS_REGISTERED, MDIOBUS_UNREGISTERED, MDIOBUS_RELEASED, } state; /** @dev: Kernel device representation */ struct device dev; /** @mdio_map: list of all MDIO devices on bus */ struct mdio_device *mdio_map[PHY_MAX_ADDR]; /** @phy_mask: PHY addresses to be ignored when probing */ u32 phy_mask; /** @phy_ignore_ta_mask: PHY addresses to ignore the TA/read failure */ u32 phy_ignore_ta_mask; /** * @irq: An array of interrupts, each PHY's interrupt at the index * matching its address */ int irq[PHY_MAX_ADDR]; /** @reset_delay_us: GPIO reset pulse width in microseconds */ int reset_delay_us; /** @reset_post_delay_us: GPIO reset deassert delay in microseconds */ int reset_post_delay_us; /** @reset_gpiod: Reset GPIO descriptor pointer */ struct gpio_desc *reset_gpiod; /** @probe_capabilities: bus capabilities, used for probing */ enum { MDIOBUS_NO_CAP = 0, MDIOBUS_C22, MDIOBUS_C45, MDIOBUS_C22_C45, } probe_capabilities; /** @shared_lock: protect access to the shared element */ struct mutex shared_lock; /** @shared: shared state across different PHYs */ struct phy_package_shared *shared[PHY_MAX_ADDR]; }; #define to_mii_bus(d) container_of(d, struct mii_bus, dev) struct mii_bus *mdiobus_alloc_size(size_t size); /** * mdiobus_alloc - Allocate an MDIO bus structure * * The internal state of the MDIO bus will be set of MDIOBUS_ALLOCATED ready * for the driver to register the bus. */ static inline struct mii_bus *mdiobus_alloc(void) { return mdiobus_alloc_size(0); } int __mdiobus_register(struct mii_bus *bus, struct module *owner); int __devm_mdiobus_register(struct device *dev, struct mii_bus *bus, struct module *owner); #define mdiobus_register(bus) __mdiobus_register(bus, THIS_MODULE) #define devm_mdiobus_register(dev, bus) \ __devm_mdiobus_register(dev, bus, THIS_MODULE) void mdiobus_unregister(struct mii_bus *bus); void mdiobus_free(struct mii_bus *bus); struct mii_bus *devm_mdiobus_alloc_size(struct device *dev, int sizeof_priv); static inline struct mii_bus *devm_mdiobus_alloc(struct device *dev) { return devm_mdiobus_alloc_size(dev, 0); } struct mii_bus *mdio_find_bus(const char *mdio_name); struct phy_device *mdiobus_scan(struct mii_bus *bus, int addr); #define PHY_INTERRUPT_DISABLED false #define PHY_INTERRUPT_ENABLED true /** * enum phy_state - PHY state machine states: * * @PHY_DOWN: PHY device and driver are not ready for anything. probe * should be called if and only if the PHY is in this state, * given that the PHY device exists. * - PHY driver probe function will set the state to @PHY_READY * * @PHY_READY: PHY is ready to send and receive packets, but the * controller is not. By default, PHYs which do not implement * probe will be set to this state by phy_probe(). * - start will set the state to UP * * @PHY_UP: The PHY and attached device are ready to do work. * Interrupts should be started here. * - timer moves to @PHY_NOLINK or @PHY_RUNNING * * @PHY_NOLINK: PHY is up, but not currently plugged in. * - irq or timer will set @PHY_RUNNING if link comes back * - phy_stop moves to @PHY_HALTED * * @PHY_RUNNING: PHY is currently up, running, and possibly sending * and/or receiving packets * - irq or timer will set @PHY_NOLINK if link goes down * - phy_stop moves to @PHY_HALTED * * @PHY_CABLETEST: PHY is performing a cable test. Packet reception/sending * is not expected to work, carrier will be indicated as down. PHY will be * poll once per second, or on interrupt for it current state. * Once complete, move to UP to restart the PHY. * - phy_stop aborts the running test and moves to @PHY_HALTED * * @PHY_HALTED: PHY is up, but no polling or interrupts are done. Or * PHY is in an error state. * - phy_start moves to @PHY_UP */ enum phy_state { PHY_DOWN = 0, PHY_READY, PHY_HALTED, PHY_UP, PHY_RUNNING, PHY_NOLINK, PHY_CABLETEST, }; #define MDIO_MMD_NUM 32 /** * struct phy_c45_device_ids - 802.3-c45 Device Identifiers * @devices_in_package: IEEE 802.3 devices in package register value. * @mmds_present: bit vector of MMDs present. * @device_ids: The device identifer for each present device. */ struct phy_c45_device_ids { u32 devices_in_package; u32 mmds_present; u32 device_ids[MDIO_MMD_NUM]; }; struct macsec_context; struct macsec_ops; /** * struct phy_device - An instance of a PHY * * @mdio: MDIO bus this PHY is on * @drv: Pointer to the driver for this PHY instance * @phy_id: UID for this device found during discovery * @c45_ids: 802.3-c45 Device Identifiers if is_c45. * @is_c45: Set to true if this PHY uses clause 45 addressing. * @is_internal: Set to true if this PHY is internal to a MAC. * @is_pseudo_fixed_link: Set to true if this PHY is an Ethernet switch, etc. * @is_gigabit_capable: Set to true if PHY supports 1000Mbps * @has_fixups: Set to true if this PHY has fixups/quirks. * @suspended: Set to true if this PHY has been suspended successfully. * @suspended_by_mdio_bus: Set to true if this PHY was suspended by MDIO bus. * @sysfs_links: Internal boolean tracking sysfs symbolic links setup/removal. * @loopback_enabled: Set true if this PHY has been loopbacked successfully. * @downshifted_rate: Set true if link speed has been downshifted. * @state: State of the PHY for management purposes * @dev_flags: Device-specific flags used by the PHY driver. * @irq: IRQ number of the PHY's interrupt (-1 if none) * @phy_timer: The timer for handling the state machine * @phylink: Pointer to phylink instance for this PHY * @sfp_bus_attached: Flag indicating whether the SFP bus has been attached * @sfp_bus: SFP bus attached to this PHY's fiber port * @attached_dev: The attached enet driver's device instance ptr * @adjust_link: Callback for the enet controller to respond to changes: in the * link state. * @phy_link_change: Callback for phylink for notification of link change * @macsec_ops: MACsec offloading ops. * * @speed: Current link speed * @duplex: Current duplex * @port: Current port * @pause: Current pause * @asym_pause: Current asymmetric pause * @supported: Combined MAC/PHY supported linkmodes * @advertising: Currently advertised linkmodes * @adv_old: Saved advertised while power saving for WoL * @lp_advertising: Current link partner advertised linkmodes * @eee_broken_modes: Energy efficient ethernet modes which should be prohibited * @autoneg: Flag autoneg being used * @link: Current link state * @autoneg_complete: Flag auto negotiation of the link has completed * @mdix: Current crossover * @mdix_ctrl: User setting of crossover * @interrupts: Flag interrupts have been enabled * @interface: enum phy_interface_t value * @skb: Netlink message for cable diagnostics * @nest: Netlink nest used for cable diagnostics * @ehdr: nNtlink header for cable diagnostics * @phy_led_triggers: Array of LED triggers * @phy_num_led_triggers: Number of triggers in @phy_led_triggers * @led_link_trigger: LED trigger for link up/down * @last_triggered: last LED trigger for link speed * @master_slave_set: User requested master/slave configuration * @master_slave_get: Current master/slave advertisement * @master_slave_state: Current master/slave configuration * @mii_ts: Pointer to time stamper callbacks * @lock: Mutex for serialization access to PHY * @state_queue: Work queue for state machine * @shared: Pointer to private data shared by phys in one package * @priv: Pointer to driver private data * * interrupts currently only supports enabled or disabled, * but could be changed in the future to support enabling * and disabling specific interrupts * * Contains some infrastructure for polling and interrupt * handling, as well as handling shifts in PHY hardware state */ struct phy_device { struct mdio_device mdio; /* Information about the PHY type */ /* And management functions */ struct phy_driver *drv; u32 phy_id; struct phy_c45_device_ids c45_ids; unsigned is_c45:1; unsigned is_internal:1; unsigned is_pseudo_fixed_link:1; unsigned is_gigabit_capable:1; unsigned has_fixups:1; unsigned suspended:1; unsigned suspended_by_mdio_bus:1; unsigned sysfs_links:1; unsigned loopback_enabled:1; unsigned downshifted_rate:1; unsigned autoneg:1; /* The most recently read link state */ unsigned link:1; unsigned autoneg_complete:1; /* Interrupts are enabled */ unsigned interrupts:1; enum phy_state state; u32 dev_flags; phy_interface_t interface; /* * forced speed & duplex (no autoneg) * partner speed & duplex & pause (autoneg) */ int speed; int duplex; int port; int pause; int asym_pause; u8 master_slave_get; u8 master_slave_set; u8 master_slave_state; /* Union of PHY and Attached devices' supported link modes */ /* See ethtool.h for more info */ __ETHTOOL_DECLARE_LINK_MODE_MASK(supported); __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); __ETHTOOL_DECLARE_LINK_MODE_MASK(lp_advertising); /* used with phy_speed_down */ __ETHTOOL_DECLARE_LINK_MODE_MASK(adv_old); /* Energy efficient ethernet modes which should be prohibited */ u32 eee_broken_modes; #ifdef CONFIG_LED_TRIGGER_PHY struct phy_led_trigger *phy_led_triggers; unsigned int phy_num_led_triggers; struct phy_led_trigger *last_triggered; struct phy_led_trigger *led_link_trigger; #endif /* * Interrupt number for this PHY * -1 means no interrupt */ int irq; /* private data pointer */ /* For use by PHYs to maintain extra state */ void *priv; /* shared data pointer */ /* For use by PHYs inside the same package that need a shared state. */ struct phy_package_shared *shared; /* Reporting cable test results */ struct sk_buff *skb; void *ehdr; struct nlattr *nest; /* Interrupt and Polling infrastructure */ struct delayed_work state_queue; struct mutex lock; /* This may be modified under the rtnl lock */ bool sfp_bus_attached; struct sfp_bus *sfp_bus; struct phylink *phylink; struct net_device *attached_dev; struct mii_timestamper *mii_ts; u8 mdix; u8 mdix_ctrl; void (*phy_link_change)(struct phy_device *phydev, bool up); void (*adjust_link)(struct net_device *dev); #if IS_ENABLED(CONFIG_MACSEC) /* MACsec management functions */ const struct macsec_ops *macsec_ops; #endif }; #define to_phy_device(d) container_of(to_mdio_device(d), \ struct phy_device, mdio) /** * struct phy_tdr_config - Configuration of a TDR raw test * * @first: Distance for first data collection point * @last: Distance for last data collection point * @step: Step between data collection points * @pair: Bitmap of cable pairs to collect data for * * A structure containing possible configuration parameters * for a TDR cable test. The driver does not need to implement * all the parameters, but should report what is actually used. * All distances are in centimeters. */ struct phy_tdr_config { u32 first; u32 last; u32 step; s8 pair; }; #define PHY_PAIR_ALL -1 /** * struct phy_driver - Driver structure for a particular PHY type * * @mdiodrv: Data common to all MDIO devices * @phy_id: The result of reading the UID registers of this PHY * type, and ANDing them with the phy_id_mask. This driver * only works for PHYs with IDs which match this field * @name: The friendly name of this PHY type * @phy_id_mask: Defines the important bits of the phy_id * @features: A mandatory list of features (speed, duplex, etc) * supported by this PHY * @flags: A bitfield defining certain other features this PHY * supports (like interrupts) * @driver_data: Static driver data * * All functions are optional. If config_aneg or read_status * are not implemented, the phy core uses the genphy versions. * Note that none of these functions should be called from * interrupt time. The goal is for the bus read/write functions * to be able to block when the bus transaction is happening, * and be freed up by an interrupt (The MPC85xx has this ability, * though it is not currently supported in the driver). */ struct phy_driver { struct mdio_driver_common mdiodrv; u32 phy_id; char *name; u32 phy_id_mask; const unsigned long * const features; u32 flags; const void *driver_data; /** * @soft_reset: Called to issue a PHY software reset */ int (*soft_reset)(struct phy_device *phydev); /** * @config_init: Called to initialize the PHY, * including after a reset */ int (*config_init)(struct phy_device *phydev); /** * @probe: Called during discovery. Used to set * up device-specific structures, if any */ int (*probe)(struct phy_device *phydev); /** * @get_features: Probe the hardware to determine what * abilities it has. Should only set phydev->supported. */ int (*get_features)(struct phy_device *phydev); /* PHY Power Management */ /** @suspend: Suspend the hardware, saving state if needed */ int (*suspend)(struct phy_device *phydev); /** @resume: Resume the hardware, restoring state if needed */ int (*resume)(struct phy_device *phydev); /** * @config_aneg: Configures the advertisement and resets * autonegotiation if phydev->autoneg is on, * forces the speed to the current settings in phydev * if phydev->autoneg is off */ int (*config_aneg)(struct phy_device *phydev); /** @aneg_done: Determines the auto negotiation result */ int (*aneg_done)(struct phy_device *phydev); /** @read_status: Determines the negotiated speed and duplex */ int (*read_status)(struct phy_device *phydev); /** @ack_interrupt: Clears any pending interrupts */ int (*ack_interrupt)(struct phy_device *phydev); /** @config_intr: Enables or disables interrupts */ int (*config_intr)(struct phy_device *phydev); /** * @did_interrupt: Checks if the PHY generated an interrupt. * For multi-PHY devices with shared PHY interrupt pin * Set interrupt bits have to be cleared. */ int (*did_interrupt)(struct phy_device *phydev); /** @handle_interrupt: Override default interrupt handling */ irqreturn_t (*handle_interrupt)(struct phy_device *phydev); /** @remove: Clears up any memory if needed */ void (*remove)(struct phy_device *phydev); /** * @match_phy_device: Returns true if this is a suitable * driver for the given phydev. If NULL, matching is based on * phy_id and phy_id_mask. */ int (*match_phy_device)(struct phy_device *phydev); /** * @set_wol: Some devices (e.g. qnap TS-119P II) require PHY * register changes to enable Wake on LAN, so set_wol is * provided to be called in the ethernet driver's set_wol * function. */ int (*set_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @get_wol: See set_wol, but for checking whether Wake on LAN * is enabled. */ void (*get_wol)(struct phy_device *dev, struct ethtool_wolinfo *wol); /** * @link_change_notify: Called to inform a PHY device driver * when the core is about to change the link state. This * callback is supposed to be used as fixup hook for drivers * that need to take action when the link state * changes. Drivers are by no means allowed to mess with the * PHY device structure in their implementations. */ void (*link_change_notify)(struct phy_device *dev); /** * @read_mmd: PHY specific driver override for reading a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD read function * will be used by phy_read_mmd(), which will use either a * direct read for Clause 45 PHYs or an indirect read for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. */ int (*read_mmd)(struct phy_device *dev, int devnum, u16 regnum); /** * @write_mmd: PHY specific driver override for writing a MMD * register. This function is optional for PHY specific * drivers. When not provided, the default MMD write function * will be used by phy_write_mmd(), which will use either a * direct write for Clause 45 PHYs, or an indirect write for * Clause 22 PHYs. devnum is the MMD device number within the * PHY device, regnum is the register within the selected MMD * device. val is the valu